@@ -9,6 +9,12 @@ __embed_ggml-common.h__

#include <metal_stdlib>

+#ifdef GGML_METAL_HAS_TENSOR
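+// metal_tensor provides the tensor<> views and MetalPerformancePrimitives the
+// cooperative mpp::tensor_ops::matmul2d that the tensor path below relies on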
+#include <metal_tensor>
+
+#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
+#endif
+
using namespace metal;

#define MAX(x, y) ((x) > (y) ? (x) : (y))
@@ -1742,7 +1748,7 @@ kernel void kernel_op_sum_f32(

    float sumf = 0;

-    for (int64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
+    for (uint64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
        sumf += src0[i0];
    }

@@ -5467,6 +5473,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_at

#undef FA_TYPES
#undef FA_TYPES_BF
+#undef FA_TYPES_F32

constant bool FC_flash_attn_ext_vec_has_mask  [[function_constant(FC_FLASH_ATTN_EXT_VEC + 0)]];
constant bool FC_flash_attn_ext_vec_has_sinks [[function_constant(FC_FLASH_ATTN_EXT_VEC + 1)]];
@@ -6088,6 +6095,7 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk576_dv512")]] kernel flas
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 576, 512, 2>;

#undef FA_TYPES
+#undef FA_TYPES_F32

constant int32_t FC_flash_attn_ext_vec_reduce_DV  [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 0)]];
constant int32_t FC_flash_attn_ext_vec_reduce_NWG [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 1)]];
@@ -8141,17 +8149,6 @@ kernel void kernel_set_rows_f(
constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];

-#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
-#define BLOCK_SIZE_K 32
-#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
-#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
-#define THREAD_PER_BLOCK 128
-#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers
-#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers
-#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
-#define SG_MAT_ROW 8
-
// each block_q contains 16*nl weights
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm(
@@ -8167,18 +8164,48 @@ kernel void kernel_mul_mm(
    threadgroup S0 * sa = (threadgroup S0 *)(shmem);
    threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);

-    const int r0 = tgpig.y;
-    const int r1 = tgpig.x;
+    threadgroup float * sc = (threadgroup float *)(shmem);
+
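+    // tile configuration: each threadgroup computes an NR0 x NR1 block of the output,
+    // consuming NK-wide slices of the K dimension per iteration; NL0/NL1 are the
+    // number of threads that cooperate on loading one row of src0/src1 respectively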
+    constexpr int NR0 = 64;
+    constexpr int NR1 = 32;
+
+    constexpr int NK  = 32;
+    constexpr int NL0 = NK/16;
+    constexpr int NL1 = NK/8;
+
    const int im = tgpig.z;
+    const int r0 = tgpig.y*NR0;
+    const int r1 = tgpig.x*NR1;

    // if this block is of 64x32 shape or smaller
-    const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
+    const short nr1 = (args.ne1 - r1 < NR1) ? (args.ne1 - r1) : NR1;

    // a thread shouldn't load data outside of the matrix
-    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+    const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
+    const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31
+
+    const short il0 = (tiitg % NL0);
+
+    short il = il0;
+
+    const int i12 = im%args.ne12;
+    const int i13 = im/args.ne12;
+
+    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const short    offset1 = il0/nl;
+
+    device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
+
+    const short iy = 8*(tiitg % NL1);
+
+    device const T1 * y = (device const T1 *)(src1
+        + args.nb13*i13
+        + args.nb12*i12
+        + args.nb11*(r1 + lr1)
+        + args.nb10*iy);

+#ifndef GGML_METAL_HAS_TENSOR
    S0_8x8 ma[4];
    S1_8x8 mb[2];

@@ -8187,36 +8214,104 @@ kernel void kernel_mul_mm(
    for (short i = 0; i < 8; i++){
        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
    }
+#else
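+    // tensor path: wrap the threadgroup tiles in tensor views; a single cooperative
+    // matmul2d spanning all 4 simdgroups then accumulates into cT for each NK slice,
+    // replacing the manual simdgroup_multiply_accumulate loop of the non-tensor path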
+    auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
+    auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));

-    short il = (tiitg % THREAD_PER_ROW);
+    mpp::tensor_ops::matmul2d<
+        mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
+        execution_simdgroups<4>> mm;

-    const int i12 = im%args.ne12;
-    const int i13 = im/args.ne12;
+    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
+#endif

-    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
-    const short    offset1 = il/nl;
+    for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
+#ifndef GGML_METAL_HAS_TENSOR
+        // load data and store to threadgroup memory
+        if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
+            threadgroup_barrier(mem_flags::mem_threadgroup);

-    device const block_q * x = (device const block_q *)(src0
-        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
+            // no need for dequantization
+            for (short i = 0; i < 16; i++) {
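+                // sx/sy select an 8x8 sub-block of the 64x32 tile, lx/ly the element
+                // within it; ib = 8*sx + sy makes each block a contiguous 64-element
+                // run in threadgroup memory, matching the simdgroup_load calls below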
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;

-    const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL));
+                //const short lx = i%8;
+                //const short ly = (tiitg/NL0)%8;
+                const short lx = (tiitg/NL0)%8;
+                const short ly = i%8;

-    device const T1 * y = (device const T1 *)(src1
-        + args.nb13*i13
-        + args.nb12*i12
-        + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
-        + args.nb10*iy);
+                const short ib = 8*sx + sy;
+
+                *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
+            }
+        } else {
+            S0_4x4 temp_a;
+            dequantize_func(x, il, temp_a);
+
+            threadgroup_barrier(mem_flags::mem_threadgroup);

-    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
+            FOR_UNROLL (short i = 0; i < 16; i++) {
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                //const short lx = i%8;
+                //const short ly = (tiitg/NL0)%8;
+                const short lx = (tiitg/NL0)%8;
+                const short ly = i%8;
+
+                const short ib = 8*sx + sy;
+
+                // NOTE: writing through sa[...] with the subscript operator is massively slower here, for reasons that are not clear
+                //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];
+
+                *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
+            }
+        }
+
+        if (FC_mul_mm_bc_inp) {
+            for (short i = 0; i < 8; ++i) {
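+                // same 8x8 blocking for src1: the 32x32 tile is 4x4 blocks and
+                // ib = 4*sx + sy picks the block this thread writes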
+                const short sx = (tiitg%NL1);
+                const short sy = (tiitg/NL1)/8;
+
+                const short lx = i;
+                const short ly = (tiitg/NL1)%8;
+                //const short lx = (tiitg/NL1)%8;
+                //const short ly = i;
+
+                const short ib = 4*sx + sy;
+
+                *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
+            }
+        } else {
+            const short sx = (tiitg%NL1);
+            const short sy = (tiitg/NL1)/8;
+
+            const short ly = (tiitg/NL1)%8;
+
+            const short ib = 4*sx + sy;
+
+            *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
+        }
+#else
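+        // tensor path: stage sa/sb in plain row-major (K-contiguous) order so the
+        // tensor views tA/tB defined above can index them directly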
        // load data and store to threadgroup memory
        if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
            threadgroup_barrier(mem_flags::mem_threadgroup);

            // no need for dequantization
            for (short i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
-                + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
-                + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0;
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                const short lx = i%8;
+                const short ly = (tiitg/NL0)%8;
+                //const short lx = (tiitg/NL0)%8;
+                //const short ly = i%8;
+
+                *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
            }
        } else {
            S0_4x4 temp_a;
@@ -8225,91 +8320,135 @@ kernel void kernel_mul_mm(
            threadgroup_barrier(mem_flags::mem_threadgroup);

            FOR_UNROLL (short i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
-                + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
-                + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                const short lx = i%8;
+                const short ly = (tiitg/NL0)%8;
+                //const short lx = (tiitg/NL0)%8;
+                //const short ly = i%8;
+
+                *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
            }
        }

        if (FC_mul_mm_bc_inp) {
            for (short i = 0; i < 8; ++i) {
-            sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? (S1) ((device T1 *) y)[i] : 0;
+                const short sx = (tiitg%NL1);
+                const short sy = (tiitg/NL1)/8;
+
+                const short lx = i;
+                const short ly = (tiitg/NL1)%8;
+                //const short lx = (tiitg/NL1)%8;
+                //const short ly = i;
+
+                *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
            }
        } else {
-            *(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y));
+            const short sx = (tiitg%NL1);
+            const short sy = (tiitg/NL1)/8;
+
+            //const short lx = i;
+            const short ly = (tiitg/NL1)%8;
+            //const short lx = (tiitg/NL1)%8;
+            //const short ly = i;
+
+            *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
        }
+#endif

        il = (il + 2 < nl) ? il + 2 : il % 2;
        x  = (il < 2) ? x + (2 + nl - 1)/nl : x;
-        y += BLOCK_SIZE_K;
+
+        y += NK;

        threadgroup_barrier(mem_flags::mem_threadgroup);

+#ifndef GGML_METAL_HAS_TENSOR
        // load matrices from threadgroup memory and conduct outer products
-        threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
-        threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
+        threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
+        threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
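+        // 2x2 simdgroup grid: sgitg%2 selects the 32-row half of A and sgitg/2 the
+        // 16-column half of B that this simdgroup accumulates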

-        #pragma unroll(4)
-        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
+        FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
            simdgroup_barrier(mem_flags::mem_none);

-            #pragma unroll(4)
-            for (short i = 0; i < 4; i++) {
-                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
+            FOR_UNROLL (short i = 0; i < 4; i++) {
+                simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
            }

-            #pragma unroll(2)
-            for (short i = 0; i < 2; i++) {
-                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
+            simdgroup_barrier(mem_flags::mem_none);
+
+            FOR_UNROLL (short i = 0; i < 2; i++) {
+                simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
            }

            simdgroup_barrier(mem_flags::mem_none);

-            #pragma unroll(8)
-            for (short i = 0; i < 8; i++){
+            FOR_UNROLL (short i = 0; i < 8; i++){
                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
            }

-            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
-            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
+            lsma += 8*64;
+            lsmb += 4*64;
        }
+#else
+        auto sA = tA.slice(0, 0);
+        auto sB = tB.slice(0, 0);
+
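+        // one cooperative matmul per NK slice; the (sB, sA) operand order presumably
+        // pairs with the transpose flags baked into the matmul2d descriptor above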
+        mm.run(sB, sA, cT);
+#endif
    }

-    if (!FC_mul_mm_bc_out || ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1)) {
+    if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
        // if no bounds checks on the output are needed, we can directly write to device memory
+#ifdef GGML_METAL_HAS_TENSOR
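+        // tensor path: view the destination block as a device tensor and store the
+        // cooperative accumulator into it directly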
+        device float * C = (device float *) dst +
+            r0 + \
+            r1 * args.ne0 + im*args.ne1*args.ne0;
+
+        auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(args.ne0, NR1));
+        cT.store(tC);
+#else
        device float * C = (device float *) dst +
-            (BLOCK_SIZE_M * r0 + 32*(sgitg &  1)) + \
-            (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
+            (r0 + 32*(sgitg &  1)) + \
+            (r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;

        for (short i = 0; i < 8; i++) {
-            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
+            simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false);
        }
+#endif
    } else {
        // block is smaller than 64x32, we should avoid writing data outside of the matrix
        threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *) shmem) \
-                                     + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
+
+        threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
+
+#ifdef GGML_METAL_HAS_TENSOR
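+        // tensor path: stage the result in threadgroup memory (sc); the bounds-checked
+        // scalar copy below then writes only the valid rows and columns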
+        auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
+        cT.store(tC);
+#else
        for (short i = 0; i < 8; i++) {
-            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
+            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
        }
+#endif

        threadgroup_barrier(mem_flags::mem_threadgroup);

        if (sgitg == 0) {
-            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.ne0 + im*args.ne1*args.ne0;
+            for (int j = tiitg; j < nr1; j += NR1) {
+                device float * D = (device float *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0;
                device float4 * D4 = (device float4 *) D;

-                threadgroup float * C = temp_str + (j*BLOCK_SIZE_M);
+                threadgroup float * C = temp_str + (j*NR0);
                threadgroup float4 * C4 = (threadgroup float4 *) C;

                int i = 0;
-                for (; i < n_rows/4; i++) {
+                for (; i < nr0/4; i++) {
                    *(D4 + i) = *(C4 + i);
                }

                i *= 4;
-                for (; i < n_rows; i++) {
+                for (; i < nr0; i++) {
                    *(D + i) = *(C + i);
                }
            }
@@ -8394,55 +8533,55 @@ kernel void kernel_mul_mm_id(
        ushort tiitg[[thread_index_in_threadgroup]],
        ushort tiisg[[thread_index_in_simdgroup]],
        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
-
    threadgroup S0 * sa = (threadgroup S0 *)(shmem);
    threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);

-    const int r0 = tgpig.y;
-    const int r1 = tgpig.x;
+    threadgroup float * sc = (threadgroup float *)(shmem);
+
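+    // same tile configuration as in kernel_mul_mm above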
+    constexpr int NR0 = 64;
+    constexpr int NR1 = 32;
+
+    constexpr int NK  = 32;
+    constexpr int NL0 = NK/16;
+    constexpr int NL1 = NK/8;
+
    const int im = tgpig.z; // expert
+    const int r0 = tgpig.y*NR0;
+    const int r1 = tgpig.x*NR1;

    device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe);
    device const int32_t  * ids_i32 = (device const int32_t  *) (hids);

    const int32_t neh1 = tpe_u32[im];

-    if (r1*BLOCK_SIZE_N >= neh1) {
+    if (r1 >= neh1) {
        return;
    }

    // if this block is of 64x32 shape or smaller
-    const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    const short n_cols = (     neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (     neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
+    const short nr1 = (     neh1 - r1 < NR1) ? (     neh1 - r1) : NR1;

    // a thread shouldn't load data outside of the matrix
-    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+    const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
+    const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31

-    S0_8x8 ma[4];
-    S1_8x8 mb[2];
+    const short il0 = (tiitg % NL0);

-    simdgroup_float8x8 mc[8];
+    short il = il0;

-    for (short i = 0; i < 8; i++){
-        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
-    }
-
-    short il = (tiitg % THREAD_PER_ROW);
-
-    const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + thread_col];
+    const int id = ids_i32[im*args.ne21 + r1 + lr1];

    const short i11 = (id % args.ne20) % args.ne11;
    const short i12 = (id / args.ne20);
    const short i13 = 0;

    const uint64_t offset0 = im*args.nb02 + i13*args.nb03;
-    const short    offset1 = il/nl;
+    const short    offset1 = il0/nl;

-    device const block_q * x = (device const block_q *)(src0
-        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
+    device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;

-    const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL));
+    const short iy = 8*(tiitg % NL1);

    device const T1 * y = (device const T1 *)(src1
        + args.nb13*i13
@@ -8450,16 +8589,113 @@ kernel void kernel_mul_mm_id(
        + args.nb11*i11
        + args.nb10*iy);

-    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
+#ifndef GGML_METAL_HAS_TENSOR
+    S0_8x8 ma[4];
+    S1_8x8 mb[2];
+
+    simdgroup_float8x8 mc[8];
+
+    for (short i = 0; i < 8; i++){
+        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    }
+#else
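+    // tensor path: identical tensor views and cooperative matmul setup as kernel_mul_mm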
+    auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
+    auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));
+
+    mpp::tensor_ops::matmul2d<
+        mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
+        execution_simdgroups<4>> mm;
+
+    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
+#endif
+
+    for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
+#ifndef GGML_METAL_HAS_TENSOR
+        // load data and store to threadgroup memory
+        if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+
+            // no need for dequantization
+            for (short i = 0; i < 16; i++) {
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                //const short lx = i%8;
+                //const short ly = (tiitg/NL0)%8;
+                const short lx = (tiitg/NL0)%8;
+                const short ly = i%8;
+
+                const short ib = 8*sx + sy;
+
+                *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
+            }
+        } else {
+            S0_4x4 temp_a;
+            dequantize_func(x, il, temp_a);
+
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+
+            FOR_UNROLL (short i = 0; i < 16; i++) {
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                //const short lx = i%8;
+                //const short ly = (tiitg/NL0)%8;
+                const short lx = (tiitg/NL0)%8;
+                const short ly = i%8;
+
+                const short ib = 8*sx + sy;
+
+                // NOTE: writing through sa[...] with the subscript operator is massively slower here, for reasons that are not clear
+                //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];
+
+                *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
+            }
+        }
+
+        if (FC_mul_mm_bc_inp) {
+            for (short i = 0; i < 8; ++i) {
+                const short sx = (tiitg%NL1);
+                const short sy = (tiitg/NL1)/8;
+
+                const short lx = i;
+                const short ly = (tiitg/NL1)%8;
+                //const short lx = (tiitg/NL1)%8;
+                //const short ly = i;
+
+                const short ib = 4*sx + sy;
+
+                *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
+            }
+        } else {
+            const short sx = (tiitg%NL1);
+            const short sy = (tiitg/NL1)/8;
+
+            const short ly = (tiitg/NL1)%8;
+
+            const short ib = 4*sx + sy;
+
+            *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
+        }
+#else
        // load data and store to threadgroup memory
        if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
            threadgroup_barrier(mem_flags::mem_threadgroup);

            // no need for dequantization
            for (short i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
-                + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
-                + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0;
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                const short lx = i%8;
+                const short ly = (tiitg/NL0)%8;
+                //const short lx = (tiitg/NL0)%8;
+                //const short ly = i%8;
+
+                *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
            }
        } else {
            S0_4x4 temp_a;
@@ -8468,85 +8704,120 @@ kernel void kernel_mul_mm_id(
            threadgroup_barrier(mem_flags::mem_threadgroup);

            FOR_UNROLL (short i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
-                + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
-                + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                const short lx = i%8;
+                const short ly = (tiitg/NL0)%8;
+                //const short lx = (tiitg/NL0)%8;
+                //const short ly = i%8;
+
+                *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
            }
        }

        if (FC_mul_mm_bc_inp) {
            for (short i = 0; i < 8; ++i) {
-            sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? (S1) ((device T1 *) y)[i] : 0;
+                const short sx = (tiitg%NL1);
+                const short sy = (tiitg/NL1)/8;
+
+                const short lx = i;
+                const short ly = (tiitg/NL1)%8;
+                //const short lx = (tiitg/NL1)%8;
+                //const short ly = i;
+
+                *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
            }
        } else {
-            *(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y));
+            const short sx = (tiitg%NL1);
+            const short sy = (tiitg/NL1)/8;
+
+            //const short lx = i;
+            const short ly = (tiitg/NL1)%8;
+            //const short lx = (tiitg/NL1)%8;
+            //const short ly = i;
+
+            *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
        }
+#endif

        il = (il + 2 < nl) ? il + 2 : il % 2;
        x  = (il < 2) ? x + (2 + nl - 1)/nl : x;
-        y += BLOCK_SIZE_K;
+
+        y += NK;

        threadgroup_barrier(mem_flags::mem_threadgroup);

+#ifndef GGML_METAL_HAS_TENSOR
        // load matrices from threadgroup memory and conduct outer products
-        threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
-        threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
-
-        #pragma unroll(4)
-        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
-            #pragma unroll(4)
-            for (short i = 0; i < 4; i++) {
-                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
+        threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
+        threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
+
+        FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
+            simdgroup_barrier(mem_flags::mem_none);
+
+            FOR_UNROLL (short i = 0; i < 4; i++) {
+                simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
            }

            simdgroup_barrier(mem_flags::mem_none);

-            #pragma unroll(2)
-            for (short i = 0; i < 2; i++) {
-                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
+            FOR_UNROLL (short i = 0; i < 2; i++) {
+                simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
            }

-            #pragma unroll(8)
-            for (short i = 0; i < 8; i++){
+            simdgroup_barrier(mem_flags::mem_none);
+
+            FOR_UNROLL (short i = 0; i < 8; i++){
                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
            }

-            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
-            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
+            lsma += 8*64;
+            lsmb += 4*64;
        }
+#else
+        auto sA = tA.slice(0, 0);
+        auto sB = tB.slice(0, 0);
+
+        mm.run(sB, sA, cT);
+#endif
    }

+    // block is smaller than 64x32, we should avoid writing data outside of the matrix
    threadgroup_barrier(mem_flags::mem_threadgroup);

-    threadgroup float * temp_str = ((threadgroup float *) shmem) \
-                                 + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
+#ifdef GGML_METAL_HAS_TENSOR
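+    // tensor path: always stage through threadgroup memory here, since the rows are
+    // scattered to per-expert destinations by the copy loop below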
+    auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
+    cT.store(tC);
+#else
+    threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;

-    #pragma unroll(8)
    for (short i = 0; i < 8; i++) {
-        simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
+        simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
    }
+#endif

    threadgroup_barrier(mem_flags::mem_threadgroup);

-    for (short j = sgitg; j < n_cols; j += 4) {
-        const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + j];
+    for (short j = sgitg; j < nr1; j += 4) {
+        const int id = ids_i32[im*args.ne21 + r1 + j];

        const short ide = id % args.ne20;
        const short idt = id / args.ne20;

-        device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + ide*args.ne0 + idt*args.ne1*args.ne0;
+        device float * D = (device float *) dst + r0 + ide*args.ne0 + idt*args.ne1*args.ne0;
        device float4 * D4 = (device float4 *) D;

-        threadgroup float * C = (threadgroup float *) shmem + (j*BLOCK_SIZE_M);
+        threadgroup float * C  = (threadgroup float *) shmem + j*NR0;
        threadgroup float4 * C4 = (threadgroup float4 *) C;

        int i = tiisg;
-        for (; i < n_rows/4; i += 32) {
+        for (; i < nr0/4; i += 32) {
            *(D4 + i) = *(C4 + i);
        }

-        i = (4*(n_rows/4)) + tiisg;
-        for (; i < n_rows; i += 32) {
+        i = (4*(nr0/4)) + tiisg;
+        for (; i < nr0; i += 32) {
            *(D + i) = *(C + i);
        }
    }