@@ -494,6 +494,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
+    cl_kernel kernel_restore_block_q4_0_noshuffle;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
     cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
@@ -634,6 +635,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_transpose_32;
     cl_kernel kernel_transpose_32_16;
     cl_kernel kernel_transpose_16;
+    cl_kernel kernel_transpose_16_buf;
     cl_kernel kernel_transpose_16_4x1;

     cl_mem A_s_d_max; // max scale buffer size for transpose
@@ -806,6 +808,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

         CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
@@ -2004,7 +2007,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
         CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
-        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
+        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
         GGML_LOG_CONT(".");
     }

@@ -3933,6 +3937,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     if (tensor->type == GGML_TYPE_Q4_0) {
         ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;

+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_kernels(backend_ctx, tensor)) {
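+            // On the Adreno path, Q4_0 weights live in device memory as separate q (packed
+            // nibbles) and d (fp16 scales) buffers in a transposed layout for the matmul
+            // kernels, so get_tensor transposes them back and re-packs them into the
+            // standard block_q4_0 layout before copying to the host.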
+            cl_int err;
+            cl_kernel kernel;
+
+            cl_int M = tensor->ne[1]; // ne01
+            cl_int K = tensor->ne[0]; // ne00
+
+            GGML_ASSERT(K % 32 == 0);
+            GGML_ASSERT(M % 4 == 0);
+
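+            // size_q: 4 bits per weight; size_d: one fp16 scale per 32-element block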
+            size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
+            size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
+            GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+            cl_mem buf_trans_q;
+            cl_mem buf_trans_d;
+
+            CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                size_q, NULL, &err), err));
+            CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                size_d, NULL, &err), err));
+
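+            // the 16-bit transpose kernel here reads and writes plain cl_mem buffers;
+            // a row of q is K/4 16-bit words, a row of d is K/32 fp16 scales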
+            kernel = backend_ctx->kernel_transpose_16_buf;
+
+            // transpose q back
+            cl_int stride_k_q = K/4;
+            size_t local_size_q[3] = {64, 1, 1};
+            size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_size_q, local_size_q, 0, NULL, NULL));
+
+            // transpose scales back
+            cl_int stride_k_d = K/32;
+            size_t local_size_d[3] = {64, 1, 1};
+            size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_size_d, local_size_d, 0, NULL, NULL));
+
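+            // recombine nibbles and scales into standard block_q4_0 layout in a temporary
+            // device buffer; the masks pick out the low and high nibbles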
+            // unpack
+            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                ggml_nbytes(tensor), NULL, &err);
+            CL_CHECK(err);
+
+            cl_uchar mask_0F = 0x0F;
+            cl_uchar mask_F0 = 0xF0;
+
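+            // one work-item per 32-element block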
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+            size_t local_work_size[] = {1, 1, 1};
+
+            kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_work_size, local_work_size, 0, NULL, NULL));
+
+            // read back to host
+            CL_CHECK(clEnqueueReadBuffer(
+                queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+
+            CL_CHECK(clReleaseMemObject(data_device));
+            CL_CHECK(clReleaseMemObject(buf_trans_q));
+            CL_CHECK(clReleaseMemObject(buf_trans_d));
+
+            return;
+        }
+#endif
+
         cl_int err;
         cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
             ggml_nbytes(tensor), NULL, &err);