2 месяцев назад · 0cfb19166b
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -1438,6 +1438,30 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d(ggml_met
 
															     return res;
														
 
															 }
														
 
															+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_2d(ggml_metal_library_t lib, const ggml_tensor * op) {
														
 
															+    assert(op->op == GGML_OP_CONV_2D);
														
 
															+
														
 
															+    GGML_ASSERT(ggml_is_contiguous(op->src[0]));
														
 
															+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
														
 
															+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
														
 
															+    GGML_ASSERT(op->type         == GGML_TYPE_F32);
														
 
															+
														
 
															+    char base[256];
														
 
															+    char name[256];
														
 
															+
														
 
															+    snprintf(base, 256, "kernel_conv_2d_%s_%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type));
														
 
															+    snprintf(name, 256, "%s", base);
														
 
															+
														
 
															+    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
														
 
															+    if (res) {
														
 
															+        return res;
														
 
															+    }
														
 
															+
														
 
															+    res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
														
 
															+
														
 
															+    return res;
														
 
															+}
														
 
															+
														
 
															 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale(ggml_metal_library_t lib, const ggml_tensor * op) {
														
 
															     assert(op->op == GGML_OP_UPSCALE);
														
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -133,6 +133,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope              (ggml_me
 
															 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col            (ggml_metal_library_t lib, const struct ggml_tensor * op);
														
 
															 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
														
 
															 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
														
 
															+ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_2d           (ggml_metal_library_t lib, const struct ggml_tensor * op);
														
 
															 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale           (ggml_metal_library_t lib, const struct ggml_tensor * op);
														
 
															 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad               (ggml_metal_library_t lib, const struct ggml_tensor * op);
														
 
															 ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d    (ggml_metal_library_t lib, const struct ggml_tensor * op);
														
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -885,6 +885,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
 
															             return true;
														
 
															         case GGML_OP_IM2COL:
														
 
															             return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
														
 
															+        case GGML_OP_CONV_2D:
														
 
															+            return ggml_is_contiguous(op->src[0]) &&
														
 
															+                   op->src[1]->type == GGML_TYPE_F32 &&
														
 
															+                   op->type == GGML_TYPE_F32 &&
														
 
															+                   (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
														
 
															         case GGML_OP_POOL_1D:
														
 
															             return false;
														
 
															         case GGML_OP_UPSCALE:
														
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -528,6 +528,36 @@ typedef struct {
 
															     uint64_t nb2;
														
 
															 } ggml_metal_kargs_conv_transpose_2d;
														
 
															+typedef struct {
														
 
															+    uint64_t nb00;
														
 
															+    uint64_t nb01;
														
 
															+    uint64_t nb02;
														
 
															+    uint64_t nb03;
														
 
															+    uint64_t nb10;
														
 
															+    uint64_t nb11;
														
 
															+    uint64_t nb12;
														
 
															+    uint64_t nb13;
														
 
															+    uint64_t nb0;
														
 
															+    uint64_t nb1;
														
 
															+    uint64_t nb2;
														
 
															+    uint64_t nb3;
														
 
															+    int32_t  IW;
														
 
															+    int32_t  IH;
														
 
															+    int32_t  KW;
														
 
															+    int32_t  KH;
														
 
															+    int32_t  IC;
														
 
															+    int32_t  OC;
														
 
															+    int32_t  OW;
														
 
															+    int32_t  OH;
														
 
															+    int32_t  N;
														
 
															+    int32_t  s0;
														
 
															+    int32_t  s1;
														
 
															+    int32_t  p0;
														
 
															+    int32_t  p1;
														
 
															+    int32_t  d0;
														
 
															+    int32_t  d1;
														
 
															+} ggml_metal_kargs_conv_2d;
														
 
															+
														
 
															 typedef struct {
														
 
															     uint64_t  ofs0;
														
 
															     uint64_t  ofs1;
														
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -10,6 +10,7 @@
 
															 #include <cassert>
														
 
															 #include <algorithm>
														
 
															+#include <limits>
														
 
															 static ggml_metal_buffer_id ggml_metal_get_buffer_id(const ggml_tensor * t) {
														
 
															     if (!t) {
														
@@ -364,6 +365,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
 
															             {
														
 
															                 n_fuse = ggml_metal_op_im2col(ctx, idx);
														
 
															             } break;
														
 
															+        case GGML_OP_CONV_2D:
														
 
															+            {
														
 
															+                n_fuse = ggml_metal_op_conv_2d(ctx, idx);
														
 
															+            } break;
														
 
															         case GGML_OP_CONV_TRANSPOSE_1D:
														
 
															             {
														
 
															                 n_fuse = ggml_metal_op_conv_transpose_1d(ctx, idx);
														
@@ -1036,11 +1041,6 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
 
															     nth = std::min(nth, nk0);
														
 
															-    if (nth*nrptg > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
														
 
															-        nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
														
 
															-        nrptg = 1;
														
 
															-    }
														
 
															-
														
 
															     ggml_metal_kargs_set_rows args = {
														
 
															         /*.nk0  =*/ nk0,
														
 
															         /*.ne01 =*/ ne01,
														
@@ -3082,6 +3082,84 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
 
															     return 1;
														
 
															 }
														
 
															+int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) {
														
 
															+    ggml_tensor * op = ctx->node(idx);
														
 
															+
														
 
															+    ggml_metal_library_t lib = ctx->lib;
														
 
															+    ggml_metal_encoder_t enc = ctx->enc;
														
 
															+
														
 
															+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
														
 
															+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
														
 
															+    GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
														
 
															+    GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
														
 
															+    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
														
 
															+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
														
 
															+
														
 
															+    GGML_ASSERT(ggml_is_contiguous(op->src[0]));
														
 
															+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
														
 
															+    GGML_ASSERT(op->type == GGML_TYPE_F32);
														
 
															+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
														
 
															+
														
 
															+    const int32_t s0 = ((const int32_t *) op->op_params)[0];
														
 
															+    const int32_t s1 = ((const int32_t *) op->op_params)[1];
														
 
															+    const int32_t p0 = ((const int32_t *) op->op_params)[2];
														
 
															+    const int32_t p1 = ((const int32_t *) op->op_params)[3];
														
 
															+    const int32_t d0 = ((const int32_t *) op->op_params)[4];
														
 
															+    const int32_t d1 = ((const int32_t *) op->op_params)[5];
														
 
															+
														
 
															+    ggml_metal_kargs_conv_2d args = {
														
 
															+        /*.nb00 =*/ nb00,
														
 
															+        /*.nb01 =*/ nb01,
														
 
															+        /*.nb02 =*/ nb02,
														
 
															+        /*.nb03 =*/ nb03,
														
 
															+        /*.nb10 =*/ nb10,
														
 
															+        /*.nb11 =*/ nb11,
														
 
															+        /*.nb12 =*/ nb12,
														
 
															+        /*.nb13 =*/ nb13,
														
 
															+        /*.nb0  =*/ nb0,
														
 
															+        /*.nb1  =*/ nb1,
														
 
															+        /*.nb2  =*/ nb2,
														
 
															+        /*.nb3  =*/ nb3,
														
 
															+        /*.IW   =*/ ne10,
														
 
															+        /*.IH   =*/ ne11,
														
 
															+        /*.KW   =*/ ne00,
														
 
															+        /*.KH   =*/ ne01,
														
 
															+        /*.IC   =*/ ne02,
														
 
															+        /*.OC   =*/ ne03,
														
 
															+        /*.OW   =*/ ne0,
														
 
															+        /*.OH   =*/ ne1,
														
 
															+        /*.N    =*/ ne3,
														
 
															+        /*.s0   =*/ s0,
														
 
															+        /*.s1   =*/ s1,
														
 
															+        /*.p0   =*/ p0,
														
 
															+        /*.p1   =*/ p1,
														
 
															+        /*.d0   =*/ d0,
														
 
															+        /*.d1   =*/ d1,
														
 
															+    };
														
 
															+
														
 
															+    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);
														
 
															+
														
 
															+    int nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
														
 
															+    nth = std::min(nth, 256);
														
 
															+    nth = std::max(nth, 1);
														
 
															+
														
 
															+    const uint64_t n_out = ggml_nelements(op);
														
 
															+
														
 
															+    uint64_t tg = (n_out + nth - 1)/nth;
														
 
															+    tg = std::max<uint64_t>(tg, 1);
														
 
															+    tg = std::min<uint64_t>(tg, (uint64_t) std::numeric_limits<int>::max());
														
 
															+
														
 
															+    ggml_metal_encoder_set_pipeline(enc, pipeline);
														
 
															+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
														
 
															+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
														
 
															+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
														
 
															+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
														
 
															+
														
 
															+    ggml_metal_encoder_dispatch_threadgroups(enc, tg, 1, 1, nth, 1, 1);
														
 
															+
														
 
															+    return 1;
														
 
															+}
														
 
															+
														
 
															 int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
														
 
															     ggml_tensor * op = ctx->node(idx);
														
--- a/ggml/src/ggml-metal/ggml-metal-ops.h
+++ b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -70,6 +70,7 @@ int ggml_metal_op_group_norm        (ggml_metal_op_t ctx, int idx);
 
															 int ggml_metal_op_norm              (ggml_metal_op_t ctx, int idx);
														
 
															 int ggml_metal_op_rope              (ggml_metal_op_t ctx, int idx);
														
 
															 int ggml_metal_op_im2col            (ggml_metal_op_t ctx, int idx);
														
 
															+int ggml_metal_op_conv_2d           (ggml_metal_op_t ctx, int idx);
														
 
															 int ggml_metal_op_conv_transpose_1d (ggml_metal_op_t ctx, int idx);
														
 
															 int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx);
														
 
															 int ggml_metal_op_upscale           (ggml_metal_op_t ctx, int idx);
														
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4146,6 +4146,120 @@ template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
 
															 //template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
														
 
															 //template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
														
 
															+template <typename TK>
														
 
															+kernel void kernel_conv_2d(
														
 
															+        constant ggml_metal_kargs_conv_2d & args,
														
 
															+        device const char * weights,
														
 
															+        device const char * src,
														
 
															+        device       char * dst,
														
 
															+        uint3   tgpig[[threadgroup_position_in_grid]],
														
 
															+        uint3    tgpg[[threadgroups_per_grid]],
														
 
															+        uint3   tpitg[[thread_position_in_threadgroup]],
														
 
															+        uint3     ntg[[threads_per_threadgroup]]) {
														
 
															+
														
 
															+    const uint threads_per_tg = ntg.x * ntg.y * ntg.z;
														
 
															+    const uint tg_index = (tgpig.z * tgpg.y + tgpig.y) * tgpg.x + tgpig.x;
														
 
															+    const uint local_thread = tpitg.z * (ntg.x * ntg.y) + tpitg.y * ntg.x + tpitg.x;
														
 
															+    const uint thread_index = tg_index * threads_per_tg + local_thread;
														
 
															+    const uint64_t total_threads = (uint64_t) threads_per_tg * tgpg.x * tgpg.y * tgpg.z;
														
 
															+    const uint64_t total_outputs = (uint64_t) args.N * args.OC * args.OH * args.OW;
														
 
															+
														
 
															+    for (uint64_t index = thread_index; index < total_outputs; index += total_threads) {
														
 
															+        uint64_t tmp = index;
														
 
															+
														
 
															+        const int32_t ow = tmp % args.OW; tmp /= args.OW;
														
 
															+        const int32_t oh = tmp % args.OH; tmp /= args.OH;
														
 
															+        const int32_t oc = tmp % args.OC; tmp /= args.OC;
														
 
															+        const int32_t  n = tmp;
														
 
															+
														
 
															+        float acc = 0.0f;
														
 
															+
														
 
															+        const int32_t base_x = ow*args.s0 - args.p0;
														
 
															+        const int32_t base_y = oh*args.s1 - args.p1;
														
 
															+
														
 
															+        int32_t ky_start = 0;
														
 
															+        if (base_y < 0) {
														
 
															+            ky_start = (-base_y + args.d1 - 1)/args.d1;
														
 
															+        }
														
 
															+        int32_t ky_end = args.KH;
														
 
															+        const int32_t y_max = args.IH - 1 - base_y;
														
 
															+        if (y_max < 0) {
														
 
															+            ky_end = ky_start;
														
 
															+        } else if (base_y + (args.KH - 1)*args.d1 >= args.IH) {
														
 
															+            ky_end = min(ky_end, y_max/args.d1 + 1);
														
 
															+        }
														
 
															+
														
 
															+        int32_t kx_start = 0;
														
 
															+        if (base_x < 0) {
														
 
															+            kx_start = (-base_x + args.d0 - 1)/args.d0;
														
 
															+        }
														
 
															+        int32_t kx_end = args.KW;
														
 
															+        const int32_t x_max = args.IW - 1 - base_x;
														
 
															+        if (x_max < 0) {
														
 
															+            kx_end = kx_start;
														
 
															+        } else if (base_x + (args.KW - 1)*args.d0 >= args.IW) {
														
 
															+            kx_end = min(kx_end, x_max/args.d0 + 1);
														
 
															+        }
														
 
															+
														
 
															+        if (ky_start < ky_end && kx_start < kx_end) {
														
 
															+            const uint64_t src_base_n = (uint64_t) n  * args.nb13;
														
 
															+            const uint64_t w_base_oc  = (uint64_t) oc * args.nb03;
														
 
															+
														
 
															+            for (int32_t ic = 0; ic < args.IC; ++ic) {
														
 
															+                const uint64_t src_base_nc = src_base_n + (uint64_t) ic * args.nb12;
														
 
															+                const uint64_t w_base_ocic = w_base_oc  + (uint64_t) ic * args.nb02;
														
 
															+
														
 
															+                for (int32_t ky = ky_start; ky < ky_end; ++ky) {
														
 
															+                    const int32_t iy = base_y + ky*args.d1;
														
 
															+                    const uint64_t src_base_row = src_base_nc + (uint64_t) iy * args.nb11;
														
 
															+                    const uint64_t w_base_row   = w_base_ocic + (uint64_t) ky * args.nb01;
														
 
															+
														
 
															+                    for (int32_t kx = kx_start; kx < kx_end; ++kx) {
														
 
															+                        const int32_t ix = base_x + kx*args.d0;
														
 
															+                        const uint64_t src_offs = src_base_row + (uint64_t) ix * args.nb10;
														
 
															+                        const uint64_t w_offs   = w_base_row   + (uint64_t) kx * args.nb00;
														
 
															+
														
 
															+                        const float x = *(device const float *)(src + src_offs);
														
 
															+                        const float w = (float) (*(device const TK *)(weights + w_offs));
														
 
															+
														
 
															+                        acc += x * w;
														
 
															+                    }
														
 
															+                }
														
 
															+            }
														
 
															+        }
														
 
															+
														
 
															+        const uint64_t dst_offs =
														
 
															+            (uint64_t) n  * args.nb3 +
														
 
															+            (uint64_t) oc * args.nb2 +
														
 
															+            (uint64_t) oh * args.nb1 +
														
 
															+            (uint64_t) ow * args.nb0;
														
 
															+
														
 
															+        *(device float *)(dst + dst_offs) = acc;
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+template [[host_name("kernel_conv_2d_f32_f32")]]
														
 
															+kernel void kernel_conv_2d<float>(
														
 
															+        constant ggml_metal_kargs_conv_2d & args,
														
 
															+        device const char * weights,
														
 
															+        device const char * src,
														
 
															+        device       char * dst,
														
 
															+        uint3   tgpig[[threadgroup_position_in_grid]],
														
 
															+        uint3    tgpg[[threadgroups_per_grid]],
														
 
															+        uint3   tpitg[[thread_position_in_threadgroup]],
														
 
															+        uint3     ntg[[threads_per_threadgroup]]);
														
 
															+
														
 
															+template [[host_name("kernel_conv_2d_f16_f32")]]
														
 
															+kernel void kernel_conv_2d<half>(
														
 
															+        constant ggml_metal_kargs_conv_2d & args,
														
 
															+        device const char * weights,
														
 
															+        device const char * src,
														
 
															+        device       char * dst,
														
 
															+        uint3   tgpig[[threadgroup_position_in_grid]],
														
 
															+        uint3    tgpg[[threadgroups_per_grid]],
														
 
															+        uint3   tpitg[[thread_position_in_threadgroup]],
														
 
															+        uint3     ntg[[threads_per_threadgroup]]);
														
 
															+
														
 
															 typedef void (conv_transpose_1d_t)(
														
 
															         constant ggml_metal_kargs_conv_transpose_1d & args,
														
 
															         device const float * src0,