hai 7 meses · 4c763c8d1b
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
 
															     mul_mv_q4_0_f32_1d_8x_flat
														
 
															     mul_mv_q4_0_f32_1d_16x_flat
														
 
															     mul_mv_q6_k
														
 
															+    mul_mv_id_q4_0_f32_8x_flat
														
 
															     mul
														
 
															     norm
														
 
															     relu
														
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
 
															     cl_program program_upscale;
														
 
															     cl_program program_concat;
														
 
															     cl_program program_tsembd;
														
 
															+    cl_program program_mul_mv_id_q4_0_f32_8x_flat;
														
 
															     cl_kernel kernel_add, kernel_add_row;
														
 
															     cl_kernel kernel_mul, kernel_mul_row;
														
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
 
															     cl_kernel kernel_concat_f32_contiguous;
														
 
															     cl_kernel kernel_concat_f32_non_contiguous;
														
 
															     cl_kernel kernel_timestep_embedding;
														
 
															+    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
														
 
															 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
														
 
															     // Transpose kernels
														
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 
															         GGML_LOG_CONT(".");
														
 
															     }
														
 
															-        // repeat
														
 
															+    // repeat
														
 
															     {
														
 
															 #ifdef GGML_OPENCL_EMBED_KERNELS
														
 
															         const std::string kernel_src {
														
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 
															         }
														
 
															     }
														
 
															+    // mul_mv_id_q4_0_f32_8x_flat
														
 
															+    {
														
 
															+#ifdef GGML_OPENCL_EMBED_KERNELS
														
 
															+        const std::string kernel_src {
														
 
															+            #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
														
 
															+        };
														
 
															+#else
														
 
															+        const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
														
 
															+#endif
														
 
															+        backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
														
 
															+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
														
 
															+
														
 
															+        CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
														
 
															+        GGML_LOG_CONT(".");
														
 
															+    }
														
 
															+
														
 
															     // Adreno kernels
														
 
															 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
														
 
															     // transpose
														
@@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
 
															                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
														
 
															             }
														
 
															             return false;
														
 
															+        case GGML_OP_MUL_MAT_ID:
														
 
															+            if (op->src[0]->type == GGML_TYPE_Q4_0) {
														
 
															+                if (op->src[1]->type == GGML_TYPE_F32) {
														
 
															+                    return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
														
 
															+                }
														
 
															+            }
														
 
															+            return false;
														
 
															         case GGML_OP_RESHAPE:
														
 
															         case GGML_OP_VIEW:
														
 
															         case GGML_OP_PERMUTE:
														
@@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
															     }
														
 
															 }
														
 
															+static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
														
 
															+    GGML_ASSERT(src0);
														
 
															+    GGML_ASSERT(src0->extra);
														
 
															+    GGML_ASSERT(src1);
														
 
															+    GGML_ASSERT(src1->extra);
														
 
															+    GGML_ASSERT(dst);
														
 
															+    GGML_ASSERT(dst->extra);
														
 
															+
														
 
															+    const ggml_tensor * src2 = dst->src[2];
														
 
															+    GGML_ASSERT(src2);
														
 
															+    GGML_ASSERT(src2->extra);
														
 
															+
														
 
															+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
														
 
															+    cl_command_queue queue = backend_ctx->queue;
														
 
															+
														
 
															+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
														
 
															+    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
														
 
															+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
														
 
															+
														
 
															+    cl_ulong offset1 = extra1->offset + src1->view_offs;
														
 
															+    cl_ulong offset2 = extra2->offset + src2->view_offs;
														
 
															+    cl_ulong offsetd = extrad->offset + dst->view_offs;
														
 
															+
														
 
															+#ifdef GGML_OPENCL_SOA_Q
														
 
															+    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
														
 
															+#endif
														
 
															+
														
 
															+    const int ne00 = src0->ne[0];
														
 
															+    const int ne01 = src0->ne[1];
														
 
															+    const int ne02 = src0->ne[2];
														
 
															+    const int ne03 = src0->ne[3];
														
 
															+
														
 
															+    const cl_ulong nb00 = src0->nb[0];
														
 
															+    const cl_ulong nb02 = src0->nb[2];
														
 
															+
														
 
															+    const int ne10 = src1->ne[0];
														
 
															+    const int ne11 = src1->ne[1];
														
 
															+    const int ne12 = src1->ne[2];
														
 
															+    const int ne13 = src1->ne[3];
														
 
															+
														
 
															+    const cl_ulong nb11 = src1->nb[1];
														
 
															+    const cl_ulong nb12 = src1->nb[2];
														
 
															+
														
 
															+    const int ne20 = src2->ne[0];
														
 
															+    const int ne21 = src2->ne[1];
														
 
															+
														
 
															+    const cl_ulong nb21 = src2->nb[1];
														
 
															+
														
 
															+    const int ne0 = dst->ne[0];
														
 
															+    const int ne1 = dst->ne[1];
														
 
															+
														
 
															+    const int r2 = ne12/ne02;
														
 
															+    const int r3 = ne13/ne03;
														
 
															+    const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
														
 
															+
														
 
															+    GGML_ASSERT(ne00 == ne10);
														
 
															+
														
 
															+    int sgs   = 32; // subgroup size
														
 
															+    int nsg   = 1;  // number of subgroups
														
 
															+    int nrows = 1;  // number of row in src1
														
 
															+    int ndst  = 4;  // number of values produced by each subgroup
														
 
															+
														
 
															+    cl_kernel kernel;
														
 
															+
														
 
															+    // subgroup mat vec
														
 
															+    switch (src0->type) {
														
 
															+        case GGML_TYPE_Q4_0: {
														
 
															+            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
														
 
															+
														
 
															+            if (backend_ctx->gpu_family == INTEL) {
														
 
															+                sgs  = 16;
														
 
															+                nsg  = 1;
														
 
															+                ndst = 8;
														
 
															+            } else if (backend_ctx->gpu_family == ADRENO) {
														
 
															+                sgs  = 64;
														
 
															+                nsg  = 1;
														
 
															+                ndst = 8;
														
 
															+            } else {
														
 
															+                GGML_ASSERT(false && "TODO: Unknown GPU");
														
 
															+            }
														
 
															+
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
														
 
															+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne20));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne21));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne0));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne1));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r2));
														
 
															+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &r3));
														
 
															+
														
 
															+            break;
														
 
															+        }
														
 
															+        default:
														
 
															+            GGML_ASSERT(false && "not implemented");;
														
 
															+    }
														
 
															+
														
 
															+    int _ne1 = 1;
														
 
															+    int ne123 = dst_rows;
														
 
															+
														
 
															+    size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
														
 
															+    size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
														
 
															+
														
 
															+#ifdef GGML_OPENCL_PROFILING
														
 
															+    cl_event evt;
														
 
															+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
														
 
															+
														
 
															+    g_profiling_info.emplace_back();
														
 
															+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
														
 
															+#else
														
 
															+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
														
 
															+#endif
														
 
															+}
														
 
															+
														
 
															 static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
														
 
															     GGML_ASSERT(src0);
														
 
															     GGML_ASSERT(src0->extra);
														
@@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
 
															             }
														
 
															             func = ggml_cl_mul_mat;
														
 
															             break;
														
 
															+        case GGML_OP_MUL_MAT_ID:
														
 
															+            if (!any_on_device) {
														
 
															+                return false;
														
 
															+            }
														
 
															+            func = ggml_cl_mul_mat_id;
														
 
															+            break;
														
 
															         case GGML_OP_SCALE:
														
 
															             if (!any_on_device) {
														
 
															                 return false;
														
--- a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
@@ -0,0 +1,283 @@
 
															+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
														
 
															+
														
 
															+#ifdef cl_intel_subgroups
														
 
															+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
														
 
															+#else
														
 
															+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
														
 
															+#endif
														
 
															+
														
 
															+#ifdef cl_intel_required_subgroup_size
														
 
															+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
														
 
															+#define INTEL_GPU 1
														
 
															+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
														
 
															+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
														
 
															+#elif defined(cl_qcom_reqd_sub_group_size)
														
 
															+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
														
 
															+#define ADRENO_GPU 1
														
 
															+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
														
 
															+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
														
 
															+#endif
														
 
															+
														
 
															+#define QK4_0                   32
														
 
															+
														
 
															+typedef char int8_t;
														
 
															+typedef uchar uint8_t;
														
 
															+typedef short int16_t;
														
 
															+typedef ushort uint16_t;
														
 
															+typedef int int32_t;
														
 
															+typedef uint uint32_t;
														
 
															+
														
 
															+//------------------------------------------------------------------------------
														
 
															+// block_q4_0
														
 
															+//------------------------------------------------------------------------------
														
 
															+struct block_q4_0
														
 
															+{
														
 
															+    half d;
														
 
															+    uint8_t qs[QK4_0 / 2];
														
 
															+};
														
 
															+
														
 
															+// This function requires the original shuffled weights.
														
 
															+// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
														
 
															+// packed together in a byte, so are (q[1], q[17]) and so on.
														
 
															+inline float block_q_4_0_dot_y_flat(
														
 
															+        global uchar * x,
														
 
															+        global half  * dh,
														
 
															+        float sumy,
														
 
															+        float16 yl,
														
 
															+        int il
														
 
															+) {
														
 
															+    float           d   = *dh;
														
 
															+    global ushort * qs  = ((global ushort *)x + il/2);
														
 
															+    float           acc = 0.f;
														
 
															+
														
 
															+    acc += yl.s0 * (qs[0] & 0x000F);
														
 
															+    acc += yl.s1 * (qs[0] & 0x0F00);
														
 
															+    acc += yl.s8 * (qs[0] & 0x00F0);
														
 
															+    acc += yl.s9 * (qs[0] & 0xF000);
														
 
															+
														
 
															+    acc += yl.s2 * (qs[1] & 0x000F);
														
 
															+    acc += yl.s3 * (qs[1] & 0x0F00);
														
 
															+    acc += yl.sa * (qs[1] & 0x00F0);
														
 
															+    acc += yl.sb * (qs[1] & 0xF000);
														
 
															+
														
 
															+    acc += yl.s4 * (qs[2] & 0x000F);
														
 
															+    acc += yl.s5 * (qs[2] & 0x0F00);
														
 
															+    acc += yl.sc * (qs[2] & 0x00F0);
														
 
															+    acc += yl.sd * (qs[2] & 0xF000);
														
 
															+
														
 
															+    acc += yl.s6 * (qs[3] & 0x000F);
														
 
															+    acc += yl.s7 * (qs[3] & 0x0F00);
														
 
															+    acc += yl.se * (qs[3] & 0x00F0);
														
 
															+    acc += yl.sf * (qs[3] & 0xF000);
														
 
															+
														
 
															+    return d * (sumy * -8.f + acc);
														
 
															+}
														
 
															+
														
 
															+//
														
 
															+// This variant outputs 8 values.
														
 
															+//
														
 
															+#undef N_DST
														
 
															+#undef N_SIMDGROUP
														
 
															+#undef N_SIMDWIDTH
														
 
															+
														
 
															+#ifdef INTEL_GPU
														
 
															+#define N_DST 8 // each SIMD group works on 8 rows
														
 
															+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
														
 
															+#define N_SIMDWIDTH 16 // subgroup size
														
 
															+#elif defined (ADRENO_GPU)
														
 
															+#define N_DST 8
														
 
															+#define N_SIMDGROUP 1
														
 
															+#define N_SIMDWIDTH 64
														
 
															+#endif
														
 
															+
														
 
															+inline void mul_vec_q_n_f32_8x_flat(
														
 
															+        global char  * src0_q,
														
 
															+        global half  * src0_d,
														
 
															+        global float * src1,
														
 
															+        global float * dst,
														
 
															+        int ne00,
														
 
															+        int ne01,
														
 
															+        int ne02,
														
 
															+        int ne10,
														
 
															+        int ne12,
														
 
															+        int ne0,
														
 
															+        int ne1,
														
 
															+        int r2,
														
 
															+        int r3
														
 
															+) {
														
 
															+    const ulong nb = ne00/QK4_0;
														
 
															+
														
 
															+    int r0 = get_group_id(0);
														
 
															+    int r1 = get_group_id(1);
														
 
															+    int im = 0;
														
 
															+
														
 
															+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
														
 
															+
														
 
															+    int i12 = im%ne12;
														
 
															+    int i13 = im/ne12;
														
 
															+
														
 
															+    // The number of scales is the same as the number of blocks.
														
 
															+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
														
 
															+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
														
 
															+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
														
 
															+
														
 
															+    global uchar * x = (global uchar *) src0_q + offset0_q;
														
 
															+    global half  * d = (global half  *) src0_d + offset0_d;
														
 
															+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
														
 
															+
														
 
															+    float16 yl;
														
 
															+    float8 sumf = 0.f;
														
 
															+
														
 
															+    int ix = get_sub_group_local_id()/2;
														
 
															+    int il = 8*(get_sub_group_local_id()%2);
														
 
															+
														
 
															+    global float * yb = y + ix*QK4_0 + il;
														
 
															+
														
 
															+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
														
 
															+        float sumy = 0.f;
														
 
															+
														
 
															+        sumy += yb[0];
														
 
															+        sumy += yb[1];
														
 
															+        sumy += yb[2];
														
 
															+        sumy += yb[3];
														
 
															+        sumy += yb[4];
														
 
															+        sumy += yb[5];
														
 
															+        sumy += yb[6];
														
 
															+        sumy += yb[7];
														
 
															+
														
 
															+        sumy += yb[16];
														
 
															+        sumy += yb[17];
														
 
															+        sumy += yb[18];
														
 
															+        sumy += yb[19];
														
 
															+        sumy += yb[20];
														
 
															+        sumy += yb[21];
														
 
															+        sumy += yb[22];
														
 
															+        sumy += yb[23];
														
 
															+
														
 
															+        yl.s0 = yb[0];
														
 
															+        yl.s1 = yb[1]/256.f;
														
 
															+
														
 
															+        yl.s2 = yb[2];
														
 
															+        yl.s3 = yb[3]/256.f;
														
 
															+
														
 
															+        yl.s4 = yb[4];
														
 
															+        yl.s5 = yb[5]/256.f;
														
 
															+
														
 
															+        yl.s6 = yb[6];
														
 
															+        yl.s7 = yb[7]/256.f;
														
 
															+
														
 
															+        yl.s8 = yb[16]/16.f;
														
 
															+        yl.s9 = yb[17]/4096.f;
														
 
															+
														
 
															+        yl.sa = yb[18]/16.f;
														
 
															+        yl.sb = yb[19]/4096.f;
														
 
															+
														
 
															+        yl.sc = yb[20]/16.f;
														
 
															+        yl.sd = yb[21]/4096.f;
														
 
															+
														
 
															+        yl.se = yb[22]/16.f;
														
 
															+        yl.sf = yb[23]/4096.f;
														
 
															+
														
 
															+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
														
 
															+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
														
 
															+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
														
 
															+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
														
 
															+
														
 
															+        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
														
 
															+        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
														
 
															+        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
														
 
															+        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
														
 
															+
														
 
															+        yb += QK4_0 * (N_SIMDWIDTH/2);
														
 
															+    }
														
 
															+
														
 
															+    float8 tot = (float8)(
														
 
															+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
														
 
															+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
														
 
															+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
														
 
															+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
														
 
															+    );
														
 
															+
														
 
															+    if (get_sub_group_local_id() == 0) {
														
 
															+        if (first_row + 0 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
														
 
															+        }
														
 
															+        if (first_row + 1 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
														
 
															+        }
														
 
															+        if (first_row + 2 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
														
 
															+        }
														
 
															+        if (first_row + 3 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
														
 
															+        }
														
 
															+
														
 
															+        if (first_row + 4 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
														
 
															+        }
														
 
															+        if (first_row + 5 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
														
 
															+        }
														
 
															+        if (first_row + 6 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
														
 
															+        }
														
 
															+        if (first_row + 7 < ne01) {
														
 
															+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
														
 
															+        }
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+#ifdef INTEL_GPU
														
 
															+REQD_SUBGROUP_SIZE_16
														
 
															+#elif defined (ADRENO_GPU)
														
 
															+REQD_SUBGROUP_SIZE_64
														
 
															+#endif
														
 
															+kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
														
 
															+        global char  *  src0_q,
														
 
															+        global half  *  src0_d,
														
 
															+        global float *  src1,
														
 
															+        ulong           offset1,
														
 
															+        global char  *  src2,
														
 
															+        ulong           offset2,
														
 
															+        global float *  dst,
														
 
															+        ulong           offsetd,
														
 
															+        int             ne00,
														
 
															+        int             ne01,
														
 
															+        int             ne02,
														
 
															+        ulong           nb00,
														
 
															+        ulong           nb02,
														
 
															+        int             ne10,
														
 
															+        int             ne11,
														
 
															+        int             ne12,
														
 
															+        ulong           nb11,
														
 
															+        ulong           nb12,
														
 
															+        int             ne20,
														
 
															+        int             ne21,
														
 
															+        ulong           nb21,
														
 
															+        int             ne0,
														
 
															+        int             ne1,
														
 
															+        int             r2,
														
 
															+        int             r3
														
 
															+) {
														
 
															+    src1 = (global float *)((global char *)src1 + offset1);
														
 
															+    src2 = (global char  *)((global char *)src2 + offset2);
														
 
															+    dst  = (global float *)((global char *)dst  + offsetd);
														
 
															+
														
 
															+    const int iid1 = get_group_id(2)/ne20;
														
 
															+    const int idx  = get_group_id(2)%ne20;
														
 
															+
														
 
															+    const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
														
 
															+
														
 
															+    const int i11 = idx%ne11;
														
 
															+    const int i12 = iid1;
														
 
															+
														
 
															+    const int i1 = idx;
														
 
															+    const int i2 = i12;
														
 
															+
														
 
															+    global char  * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
														
 
															+    global half  * src0_d_cur = src0_d + (i02*nb02/nb00);
														
 
															+    global float * src1_cur   = (global float *)((global char *) src1  + i11*nb11 + i12*nb12);
														
 
															+    global float * dst_cur    = dst + i1*ne0 + i2*ne1*ne0;
														
 
															+
														
 
															+    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
														
 
															+}