@@ -402,6 +402,7 @@ struct ggml_backend_opencl_context {
cl_program program_conv_2d_f32;
cl_program program_conv_2d_f16_f32;
cl_program program_tsembd;
+ cl_program program_gemv_moe_mxfp4_f32, program_gemm_moe_mxfp4_f32;
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
cl_program program_mul_mv_id_mxfp4_f32;
@@ -452,7 +453,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_f16_f32_tiled;
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
- cl_kernel kernel_convert_block_mxfp4, kernel_restore_block_mxfp4;
+ cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
@@ -475,6 +476,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
cl_kernel kernel_timestep_embedding;
+ cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
cl_kernel kernel_mul_mv_id_mxfp4_f32;
@@ -559,14 +561,14 @@ struct ggml_backend_opencl_context {

fprintf(ftrace, "[\n");
for (const ProfilingInfo & info : profiling_info) {
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Host\"},\n",
info.kernel_name.c_str(), info.cmd_queued/1000);
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Host\"},\n",
info.kernel_name.c_str(), info.cmd_submit/1000);

- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Device\"},\n",
info.kernel_name.c_str(), info.cmd_start/1000);
- fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+ fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Device\"},\n",
info.kernel_name.c_str(), info.cmd_end/1000);
}
fclose(ftrace);
@@ -777,6 +779,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
+ CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
+ CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
@@ -1991,6 +1995,42 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
GGML_LOG_CONT(".");
}
+
+ std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
+ " -cl-mad-enable "
+ " -cl-fast-relaxed-math";
+
+ // gemv_moe_mxfp4_f32
+ {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "gemv_moe_mxfp4_f32.cl.h"
+ };
+#else
+ const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
+#endif
+ backend_ctx->program_gemv_moe_mxfp4_f32 =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
+ GGML_LOG_CONT(".");
+ }
+
+ // gemm_moe_mxfp4_f32
+ {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+ const std::string kernel_src {
+ #include "gemm_moe_mxfp4_f32.cl.h"
+ };
+#else
+ const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
+#endif
+ backend_ctx->program_gemm_moe_mxfp4_f32 =
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
+
+ CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
+ GGML_LOG_CONT(".");
+ }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
GGML_LOG_CONT("\n");
}
@@ -3299,6 +3339,12 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
tensor->ne[2] == 1 && tensor->ne[3] == 1;
}

+inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
+ GGML_UNUSED(backend_ctx);
+ int ne01 = tensor->ne[1];
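+ // limit the MoE path to expert weight tensors (names containing "ffn" or "as") whose row count is a multiple of 64, matching the 64-wide work-groups used below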
+ return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
+}
+
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);

@@ -3601,14 +3647,39 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
CL_CHECK(err);

+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (use_adreno_moe_kernels(backend_ctx, tensor)) {
+ cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
+
+ int ne00 = tensor->ne[0];
+ int ne01 = tensor->ne[1];
+ int ne02 = tensor->ne[2];
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
+
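+ // one work-item per (output row, 32-element MXFP4 block); rows are rounded up to a multiple of 64 so the 64x2 work-group divides the grid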
+ size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
+ size_t local_work_size[3] = {64, 2, 1};
+
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clReleaseMemObject(data_device));
+ tensor->extra = extra;
+
+ return;
+ }
+#endif
cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));

- size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
- size_t local_work_size[] = {64, 1, 1};
+ size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+ size_t local_work_size[3] = {64, 1, 1};

cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
@@ -3624,7 +3695,6 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
{ extra->q }
};
extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
-
tensor->extra = extra;

return;
@@ -3751,6 +3821,33 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);

+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (use_adreno_moe_kernels(backend_ctx, tensor)) {
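+ // undo the transposed layout written by kernel_convert_block_mxfp4_trans, then copy the restored blocks back to the host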
+ cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;
+
+ int ne00 = tensor->ne[0];
+ int ne01 = tensor->ne[1];
+ int ne02 = tensor->ne[2];
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+
+ size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
+ size_t local_work_size[3] = {64, 2, 1};
+
+ cl_event evt;
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+ global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clWaitForEvents(1, &evt));
+ CL_CHECK(clEnqueueReadBuffer(
+ queue, data_device, CL_TRUE, offset,
+ size, data, 0, NULL, NULL));
+ CL_CHECK(clReleaseMemObject(data_device));
+ return;
+ }
+#endif
cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
@@ -7553,6 +7650,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
const int ne21 = src2->ne[1];

const cl_ulong nb21 = src2->nb[1];
+ const cl_ulong nb20 = src2->nb[0];

const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
@@ -7692,6 +7790,105 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
break;
}
case GGML_TYPE_MXFP4: {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+ if (use_adreno_moe_kernels(backend_ctx, src0)) {
+ cl_int status;
+
+ size_t local_size[3] = {64, 2, 1};
+ size_t global_size[3] = {64, 2, 1};
+
+ cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
+
+ int tile_size = 320;
+ if (ne12 == 1) { // for gemv
+ kernel = backend_ctx->kernel_gemv_moe_mxfp4_f32;
+
+ // create a sub_buffer for src2
+ cl_buffer_region region;
+ region.origin = offset2;
+ region.size = ne20 * ne21 * sizeof(int);
+ buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+ CL_CHECK(status);
+
+ // set thread grid
+ global_size[0] = static_cast<size_t>(ne01);
+ global_size[1] = 4;
+ global_size[2] = static_cast<size_t>(ne20);
+ local_size[1] = 4;
+ } else { // for gemm
+ kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32;
+
+ // preprocess router table
+ int num_tiles_per_expert = (ne01 + tile_size - 1) / tile_size;
+ void * host_src2_reorder = malloc(ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short));
+ void * host_src2 = malloc(ne21 * nb21);
+ CL_CHECK(clEnqueueReadBuffer(backend_ctx->queue, extra2->data_device, CL_TRUE, offset2, ne21 * nb21, host_src2, 0, NULL, NULL));
+ int total_experts = nb21 / nb20;
+ int out_idx = 0;
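+ // every (token j, expert slot i) routed to expert i_expert emits one 4-short record per row tile:
+ // {expert id, flattened src1 index j*ne11 + i%ne11, flattened dst index j*ne20 + i, tile index i_tile}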
+ for (int i_expert = 0; i_expert < ne02; i_expert++) {
+ for (int i_tile = 0; i_tile < num_tiles_per_expert; i_tile++) {
+ for (int j = 0; j < ne21; j++) {
+ for (int i = 0; i < ne20; i++) {
+ int expert = ((int *)host_src2)[j * total_experts + i];
+ if (i_expert == expert) {
+ ((short *)host_src2_reorder)[out_idx] = static_cast<short>(expert);
+ ((short *)host_src2_reorder)[out_idx + 1] = static_cast<short>(j * ne11 + (i % ne11));
+ ((short *)host_src2_reorder)[out_idx + 2] = static_cast<short>(j * ne20 + i);
+ ((short *)host_src2_reorder)[out_idx + 3] = static_cast<short>(i_tile);
+ out_idx += 4;
+ }
+ }
+ }
+ }
+ }
+ buf_src2 = clCreateBuffer(backend_ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short), host_src2_reorder, &status);
+ CL_CHECK(status);
+
+ // set thread grid
+ global_size[0] = static_cast<size_t>(tile_size);
+ global_size[2] = static_cast<size_t>(ne20 * ne21 * num_tiles_per_expert);
+ }
+
+ // create a sub_buffer for src1
+ cl_buffer_region region;
+ region.origin = offset1;
+ region.size = ne10 * ne11 * ne12 * sizeof(float);
+ src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+ CL_CHECK(status);
+
+ // create image for src1
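+ // CL_RGBA / CL_FLOAT texels pack 4 floats each, so the 1D image width below is the total float count divided by 4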
+ cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
+ cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
+ buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
+ CL_CHECK(status);
+
+ // Set kernel args
+ int arg_idx = 0;
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->q));
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->e));
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
+ if (ne12 == 1) {
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));
+ } else {
+ CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &tile_size));
+ }
+
+ // launch kernel
+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
+
+ // deallocate sub buffers and images
+ CL_CHECK(clReleaseMemObject(src1_sub_buffer));
+ CL_CHECK(clReleaseMemObject(buf_src1_image));
+ CL_CHECK(clReleaseMemObject(buf_src2));
+ return;
+ } // else fallback to generic kernel
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
#ifdef GGML_OPENCL_SOA_Q
kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat;