@@ -365,6 +365,7 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
     cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
     cl_program program_mul_mv_q6_K;
+    cl_program program_mul_mv_mxfp4_f32;
     cl_program program_mul_mv_f16_f16;
     cl_program program_mul_mv_f16_f32_1row;
     cl_program program_mul_mv_f16_f32_l4;
@@ -398,6 +399,7 @@ struct ggml_backend_opencl_context {
     cl_program program_conv_2d_f16_f32;
     cl_program program_tsembd;
     cl_program program_mul_mv_id_q4_0_f32_8x_flat;
+    cl_program program_mul_mv_id_mxfp4_f32;
     cl_program program_mul_mm_f32_f32_l4_lm;
     cl_program program_mul_mm_f16_f32_l4_lm;
 
@@ -439,6 +441,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
+    cl_kernel kernel_mul_mv_mxfp4_f32;
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
@@ -455,6 +458,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_conv_2d_f16_f32;
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+    cl_kernel kernel_mul_mv_id_mxfp4_f32;
     cl_kernel kernel_mul_mm_f32_f32_l4_lm;
     cl_kernel kernel_mul_mm_f16_f32_l4_lm;
 
@@ -577,6 +581,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_transpose_32;
     cl_kernel kernel_transpose_32_16;
     cl_kernel kernel_transpose_16;
+    cl_kernel kernel_transpose_16_4x1;
 
     cl_mem A_s_d_max; // max scale buffer size for transpose
     cl_mem A_q_d_max; // max weight buffer size for transpose
@@ -971,6 +976,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32, "kernel_mul_mv_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul_mv_f16_f16
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1611,6 +1632,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mv_id_mxfp4_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_mxfp4_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32.cl");
+#endif
+        backend_ctx->program_mul_mv_id_mxfp4_f32 =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32, "kernel_mul_mv_id_mxfp4_f32", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1628,6 +1665,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -2552,13 +2590,14 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
                 return op->src[1]->type == GGML_TYPE_F32;
-            } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+            } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             }
             return false;
         case GGML_OP_MUL_MAT_ID:
-            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+            if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+                op->src[0]->type == GGML_TYPE_MXFP4) {
                 if (op->src[1]->type == GGML_TYPE_F32) {
                     return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
                 }
@@ -2944,7 +2983,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
         CL_CHECK(err);
 
-        // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
+        bool K_tile_trans = true;
+        if ((K / 32) % 4 != 0) {
+            K_tile_trans = false;
+        }
         size_t d_size_bytes = M * (K / 32) * 2;
         region.origin = 0;
         region.size = d_size_bytes;
@@ -2985,10 +3027,15 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
 
-        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
         memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        if (K_tile_trans) {
+            img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32 / 4;
+        } else {
+            img_fmt_1d = { CL_R, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32;
+        }
         img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K / 32 / 4;
         img_desc_1d.buffer = extra->d;
         d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
@@ -3024,6 +3071,10 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         int width_s = K / 32 / 4;
 
         kernel = backend_ctx->kernel_transpose_16;
+        if (!K_tile_trans) {
+            kernel = backend_ctx->kernel_transpose_16_4x1;
+            width_s = K / 32;
+        }
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
@@ -6254,11 +6305,47 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
             CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
             break;
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0, nullptr));
+            break;
+        }
         default:
             GGML_ASSERT(false && "not implemented");
     }
 
-    if (src0t == GGML_TYPE_Q4_0 ||
+    if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
         src0t == GGML_TYPE_Q4_1 ||
         src0t == GGML_TYPE_Q8_0 ||
         src0t == GGML_TYPE_Q2_K) {
@@ -6307,10 +6394,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
     cl_ulong offset1 = extra1->offset + src1->view_offs;
     cl_ulong offset2 = extra2->offset + src2->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
@@ -6325,7 +6414,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     const int ne03 = src0->ne[3];
 
     const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
     const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
 
     const int ne10 = src1->ne[0];
     const int ne11 = src1->ne[1];
@@ -6334,6 +6425,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
     const cl_ulong nb11 = src1->nb[1];
     const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
 
     const int ne20 = src2->ne[0];
     const int ne21 = src2->ne[1];
@@ -6401,6 +6493,49 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
 
             break;
         }
+        case GGML_TYPE_MXFP4: {
+            kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 2;
+                ndst = 2;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 2;
+                ndst = 2;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, nullptr));
+
+            break;
+        }
         default:
             GGML_ASSERT(false && "not implemented");;
     }