@@ -398,6 +398,7 @@ struct ggml_backend_opencl_context {
     int adreno_wave_size;
 
     cl_bool non_uniform_workgroups;
+    size_t image_max_buffer_size;
 
     cl_context context;
     cl_command_queue queue;
@@ -407,6 +408,10 @@ struct ggml_backend_opencl_context {
     ggml_cl_buffer prealloc_scales_trans;
     ggml_cl_buffer prealloc_act_trans;
 
+    // prealloc buffers for src0 and src1
+    ggml_cl_buffer prealloc_src0;
+    ggml_cl_buffer prealloc_src1;
+
     cl_program program_add;
     cl_program program_add_id;
     cl_program program_clamp;
@@ -2658,6 +2663,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
     GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
 
+    clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
+    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
+
     clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
     GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
 
@@ -4711,6 +4719,81 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg
            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }
 
+// Copy a noncontiguous tensor into a contiguous buffer. ne[] stays the same,
+// but nb[] is recalculated so that the result is contiguous.
+static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
+                                       cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    const int tensor_type_size = ggml_type_size(src->type);
+
+    const int ne00 = src->ne[0];
+    const int ne01 = src->ne[1];
+    const int ne02 = src->ne[2];
+    const int ne03 = src->ne[3];
+
+    const cl_ulong nb00 = src->nb[0];
+    const cl_ulong nb01 = src->nb[1];
+    const cl_ulong nb02 = src->nb[2];
+    const cl_ulong nb03 = src->nb[3];
+
+    const int ne0 = src->ne[0];
+    const int ne1 = src->ne[1];
+    const int ne2 = src->ne[2];
+    const int ne3 = src->ne[3];
+
+    nb0 = tensor_type_size;
+    nb1 = tensor_type_size*ne00;
+    nb2 = tensor_type_size*ne00*ne01;
+    nb3 = tensor_type_size*ne00*ne01*ne02;
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
+
+    cl_ulong offset0 = extra->offset + src->view_offs;
+    cl_ulong offsetd = 0;
+
+    cl_kernel kernel;
+
+    switch (src->type) {
+        case GGML_TYPE_F32:
+            kernel = backend_ctx->kernel_cpy_f32_f32;
+            break;
+        case GGML_TYPE_F16:
+            kernel = backend_ctx->kernel_cpy_f16_f16;
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne2));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne3));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
+
+    const int nth = MIN(64, ne00);
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
+}
+
 static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     UNUSED(backend);
     UNUSED(src0);
@@ -7724,9 +7807,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     cl_context context = backend_ctx->context;
 
     if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
-        if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
+        if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0 &&
+            // dst is wrapped in an image1d_buffer_t, so the image buffer size limit applies (and to src0 as well)
+            (ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4 <= backend_ctx->image_max_buffer_size)) {
             // For KQ
             if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
+                ((nb01 * ne01 / 4)/4 <= backend_ctx->image_max_buffer_size) &&
                 nb00 <= nb02 &&
                 nb02 <= nb01 &&
                 nb01 <= nb03 &&
@@ -7737,7 +7823,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 return;
             }
             // For KQV
-            if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+            if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+                ((nb02 * ne02 / 4)/4 <= backend_ctx->image_max_buffer_size)) {
                 ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
                 return;
             }
@@ -8043,9 +8130,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     // GEMM using local memory
     // Current BK = 16, so ne00 % 16 == 0
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        src1t == GGML_TYPE_F32 &&
+    if (src1t == GGML_TYPE_F32 &&
         ne00 % 16 == 0 &&
         ne11 > 1) {
         switch(src0t) {
@@ -8057,10 +8142,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             int batch_stride_b = ne10*ne11;
             int batch_stride_d = ne0*ne1;
 
-            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
-            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            cl_mem mem_src0 = extra0->data_device;
+            cl_mem mem_src1 = extra1->data_device;
+
+            cl_ulong nb00_cont = nb00;
+            cl_ulong nb01_cont = nb01;
+            cl_ulong nb02_cont = nb02;
+            cl_ulong nb03_cont = nb03;
+
+            cl_ulong nb10_cont = nb10;
+            cl_ulong nb11_cont = nb11;
+            cl_ulong nb12_cont = nb12;
+            cl_ulong nb13_cont = nb13;
+
+            cl_ulong offset0_cont = offset0;
+            cl_ulong offset1_cont = offset1;
+
+            if (!ggml_is_contiguous(src0)) {
+                backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                    nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+                mem_src0 = backend_ctx->prealloc_src0.buffer;
+                offset0_cont = 0;
+            }
+
+            if (!ggml_is_contiguous(src1)) {
+                backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                    nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+                mem_src1 = backend_ctx->prealloc_src1.buffer;
+                offset1_cont = 0;
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
             CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
             CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
             CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
@@ -8092,10 +8209,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             int batch_stride_b = ne10*ne11;
             int batch_stride_d = ne0*ne1;
 
-            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
-            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            cl_mem mem_src0 = extra0->data_device;
+            cl_mem mem_src1 = extra1->data_device;
+
+            cl_ulong nb00_cont = nb00;
+            cl_ulong nb01_cont = nb01;
+            cl_ulong nb02_cont = nb02;
+            cl_ulong nb03_cont = nb03;
+
+            cl_ulong nb10_cont = nb10;
+            cl_ulong nb11_cont = nb11;
+            cl_ulong nb12_cont = nb12;
+            cl_ulong nb13_cont = nb13;
+
+            cl_ulong offset0_cont = offset0;
+            cl_ulong offset1_cont = offset1;
+
+            if (!ggml_is_contiguous(src0)) {
+                backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
+                ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
+                    nb00_cont, nb01_cont, nb02_cont, nb03_cont);
+                mem_src0 = backend_ctx->prealloc_src0.buffer;
+                offset0_cont = 0;
+            }
+
+            if (!ggml_is_contiguous(src1)) {
+                backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
+                ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
+                    nb10_cont, nb11_cont, nb12_cont, nb13_cont);
+                mem_src1 = backend_ctx->prealloc_src1.buffer;
+                offset1_cont = 0;
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
             CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
             CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
             CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
@@ -8123,6 +8272,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             if (ne11 < 32) {
                 break;
             }
+            if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
+                break;
+            }
+
             kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
             nth0 = 128; // calculated as (BM*BN)/(TM*TN)
 
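
For readers following the stride logic above, here is a minimal, self-contained C++ sketch (not part of the patch) of what ggml_cl_copy_to_contiguous computes on the host side: it rebuilds contiguous byte strides nb0..nb3 from the tensor extents and element size, and adds a guard in the spirit of the image1d_buffer_t size checks, assuming 4-byte pixels as in the dst check. The helper names compute_contiguous_strides and fits_image1d_buffer, and the use of plain size_t instead of cl_ulong, are illustrative assumptions rather than part of the backend.

#include <cstddef>
#include <cstdio>

// Illustrative only: recompute contiguous strides (in bytes) for a 4-D tensor
// with extents ne[4] and element size type_size, mirroring how the patch's
// ggml_cl_copy_to_contiguous derives nb0..nb3 for its contiguous destination.
static void compute_contiguous_strides(const size_t ne[4], size_t type_size, size_t nb[4]) {
    nb[0] = type_size;      // elements packed along dim 0
    nb[1] = nb[0] * ne[0];  // one full row
    nb[2] = nb[1] * ne[1];  // one full plane
    nb[3] = nb[2] * ne[2];  // one full 3-D slab
}

// Hypothetical guard in the spirit of the image1d_buffer_t checks above,
// assuming the buffer is viewed as 4-byte pixels: its byte size divided by 4
// must not exceed CL_DEVICE_IMAGE_MAX_BUFFER_SIZE (reported in pixels).
static bool fits_image1d_buffer(size_t nbytes, size_t image_max_buffer_size_px) {
    return nbytes / 4 <= image_max_buffer_size_px;
}

int main() {
    const size_t ne[4] = {128, 64, 8, 1};  // example extents
    size_t nb[4];
    compute_contiguous_strides(ne, sizeof(float), nb);
    std::printf("nb = {%zu, %zu, %zu, %zu}\n", nb[0], nb[1], nb[2], nb[3]);
    std::printf("fits image1d_buffer: %d\n", (int) fits_image1d_buffer(nb[3] * ne[3], (size_t) 1 << 27));
    return 0;
}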