@@ -400,10 +400,10 @@ struct ggml_backend_opencl_context {
cl_program program_mul_mm_f32_f32_l4_lm;
cl_program program_mul_mm_f16_f32_l4_lm;

- cl_kernel kernel_add, kernel_add_row;
- cl_kernel kernel_mul, kernel_mul_row;
- cl_kernel kernel_div, kernel_div_row;
- cl_kernel kernel_sub, kernel_sub_row;
+ cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
+ cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
+ cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
+ cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
cl_kernel kernel_scale;
cl_kernel kernel_silu, kernel_silu_4;
cl_kernel kernel_gelu, kernel_gelu_4;
@@ -674,8 +674,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
backend_ctx->program_add =
    build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- CL_CHECK((backend_ctx->kernel_add     = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
- CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_add         = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
+ CL_CHECK((backend_ctx->kernel_add_row     = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_add_f16     = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
GGML_LOG_CONT(".");
}

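Note: the f16 entry points created here (and the analogous mul/div/sub ones below) are expected to exist in the corresponding .cl sources, which are not part of this excerpt. As a rough, hedged sketch only, a half4-vectorized row-broadcast kernel in add.cl, mirroring the existing float4 kernel_add_row and taking its arguments in the same order as the clSetKernelArg calls on the row path below (src0/offset0, src1/offset1, dst/offsetd, ne), might look something like:

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Hedged illustration, not taken from this patch: elementwise add of half4 vectors,
// broadcasting a single contiguous src1 row over src0. ne is the row length in half4 units.
kernel void kernel_add_row_f16(
        global half4 * src0,
        ulong          offset0,
        global half4 * src1,
        ulong          offset1,
        global half4 * dst,
        ulong          offsetd,
        int            ne
) {
    src0 = (global half4*)((global char*)src0 + offset0);
    src1 = (global half4*)((global char*)src1 + offset1);
    dst  = (global half4*)((global char*)dst  + offsetd);

    const uint gid  = get_global_id(0);
    const uint idx1 = gid - (gid / ne) * ne; // gid % ne: wrap around the single src1 row
    dst[gid] = src0[gid] + src1[idx1];
}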
@@ -1089,8 +1091,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
backend_ctx->program_mul =
    build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- CL_CHECK((backend_ctx->kernel_mul     = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
- CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul         = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_row     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_f16     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
GGML_LOG_CONT(".");
}

@@ -1288,11 +1292,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("div.cl");
#endif
+ std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+     " -cl-mad-enable -cl-finite-math-only ";
+
backend_ctx->program_div =
    build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- CL_CHECK((backend_ctx->kernel_div     = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
- CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_div         = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+ CL_CHECK((backend_ctx->kernel_div_row     = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_div_f16     = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
GGML_LOG_CONT(".");
}

@@ -1308,8 +1317,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
backend_ctx->program_sub =
    build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

- CL_CHECK((backend_ctx->kernel_sub     = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
- CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub         = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub_row     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub_f16     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
+ CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
GGML_LOG_CONT(".");
}

@@ -2447,12 +2458,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
default:
    return false;
}
- case GGML_OP_ADD:
case GGML_OP_SCALE:
+     return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
+ case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SUB:
-     return op->src[0]->type == GGML_TYPE_F32;
+     return (op->src[0]->type == op->src[1]->type) &&
+            (op->src[0]->type == op->type) &&
+            (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
case GGML_OP_UNARY:
    switch (ggml_get_unary_op(op)) {
        case GGML_UNARY_OP_GELU:
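In other words, GGML_OP_SCALE keeps requiring contiguous F32 input, while the elementwise ops are now accepted whenever src0, src1 and dst all share one type and that type is F32 or F16. Illustration only (not part of this patch), using the public ggml API with a made-up helper name:

#include "ggml.h"

// Hypothetical helper for illustration: builds an elementwise F16 multiply whose
// src0, src1 and dst are all GGML_TYPE_F16 -- exactly the kind of op that the
// relaxed check above now reports as supported by the OpenCL backend.
static struct ggml_tensor * build_f16_mul(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024);
    return ggml_mul(ctx, a, b); // dst takes src0's type, so the whole op is F16
}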
@@ -3680,35 +3694,39 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);

- const int ne00 = src0 ? src0->ne[0] : 0;
- const int ne01 = src0 ? src0->ne[1] : 0;
- const int ne02 = src0 ? src0->ne[2] : 0;
- const int ne03 = src0 ? src0->ne[3] : 0;
+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);

- const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
- const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
- const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
- const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];

- const int ne10 = src1 ? src1->ne[0] : 0;
- const int ne11 = src1 ? src1->ne[1] : 0;
- const int ne12 = src1 ? src1->ne[2] : 0;
- const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];

- const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
- const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
- const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
- const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+ const int ne10 = src1->ne[0];
+ const int ne11 = src1->ne[1];
+ const int ne12 = src1->ne[2];
+ const int ne13 = src1->ne[3]; UNUSED(ne13);

- const int ne0 = dst ? dst->ne[0] : 0;
- const int ne1 = dst ? dst->ne[1] : 0;
- const int ne2 = dst ? dst->ne[2] : 0;
- const int ne3 = dst ? dst->ne[3] : 0;
+ const cl_ulong nb10 = src1->nb[0];
+ const cl_ulong nb11 = src1->nb[1];
+ const cl_ulong nb12 = src1->nb[2];
+ const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);

- const cl_ulong nb0 = dst ? dst->nb[0] : 0;
- const cl_ulong nb1 = dst ? dst->nb[1] : 0;
- const cl_ulong nb2 = dst ? dst->nb[2] : 0;
- const cl_ulong nb3 = dst ? dst->nb[3] : 0;
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];
+
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];

ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -3731,7 +3749,12 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const

bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_add_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_add_row;
+ } else {
+     kernel = backend_ctx->kernel_add_row_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3741,7 +3764,11 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_add;
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_add;
+ } else {
+     kernel = backend_ctx->kernel_add_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
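For orientation: the *_row kernels selected above are only used on a broadcast fast path, where src1 is a single contiguous row and the row length is a multiple of 4, which is what allows the 4-wide vector kernels and the ne = ne00 / 4 element count. The guard itself sits just above these hunks and is not changed by the patch; a hedged sketch of what it checks, wrapped in a made-up helper for illustration:

#include "ggml.h"

// Hypothetical helper, for illustration only; the real check lives inline in
// ggml_cl_add()/ggml_cl_mul()/ggml_cl_div()/ggml_cl_sub(), just above the
// bcast_row assignment, and is not part of this diff.
static bool use_bcast_row(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
    return ggml_nelements(src1) == src1->ne[0] &&  // src1 is one row, broadcast over src0
           ggml_is_contiguous(src1)            &&
           src0->ne[0] % 4 == 0                &&  // rows are 4-aligned, so the float4/half4
           src1->ne[0] % 4 == 0;                   // row kernels can run with ne = ne00 / 4
}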
@@ -3803,35 +3830,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);

- const int ne00 = src0 ? src0->ne[0] : 0;
- const int ne01 = src0 ? src0->ne[1] : 0;
- const int ne02 = src0 ? src0->ne[2] : 0;
- const int ne03 = src0 ? src0->ne[3] : 0;
+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);

- const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
- const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
- const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
- const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ const int ne03 = src0->ne[3];

- const int ne10 = src1 ? src1->ne[0] : 0;
- const int ne11 = src1 ? src1->ne[1] : 0;
- const int ne12 = src1 ? src1->ne[2] : 0;
- const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+ const cl_ulong nb00 = src0->nb[0];
+ const cl_ulong nb01 = src0->nb[1];
+ const cl_ulong nb02 = src0->nb[2];
+ const cl_ulong nb03 = src0->nb[3];

- const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
- const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
- const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
- const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+ const int ne10 = src1->ne[0];
+ const int ne11 = src1->ne[1];
+ const int ne12 = src1->ne[2];
+ const int ne13 = src1->ne[3]; UNUSED(ne13);
+
+ const cl_ulong nb10 = src1->nb[0];
+ const cl_ulong nb11 = src1->nb[1];
+ const cl_ulong nb12 = src1->nb[2];
+ const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);

- const int ne0 = dst ? dst->ne[0] : 0;
- const int ne1 = dst ? dst->ne[1] : 0;
- const int ne2 = dst ? dst->ne[2] : 0;
- const int ne3 = dst ? dst->ne[3] : 0;
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ const int ne3 = dst->ne[3];

- const cl_ulong nb0 = dst ? dst->nb[0] : 0;
- const cl_ulong nb1 = dst ? dst->nb[1] : 0;
- const cl_ulong nb2 = dst ? dst->nb[2] : 0;
- const cl_ulong nb3 = dst ? dst->nb[3] : 0;
+ const cl_ulong nb0 = dst->nb[0];
+ const cl_ulong nb1 = dst->nb[1];
+ const cl_ulong nb2 = dst->nb[2];
+ const cl_ulong nb3 = dst->nb[3];

ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

@@ -3854,7 +3885,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const

bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_mul_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_mul_row;
+ } else {
+     kernel = backend_ctx->kernel_mul_row_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3864,7 +3900,11 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_mul;
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_mul;
+ } else {
+     kernel = backend_ctx->kernel_mul_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3926,6 +3966,10 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);

+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
@@ -3974,7 +4018,12 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const

bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_div_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_div_row;
+ } else {
+     kernel = backend_ctx->kernel_div_row_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3984,7 +4033,11 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_div;
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_div;
+ } else {
+     kernel = backend_ctx->kernel_div_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -4034,6 +4087,10 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);

+ GGML_ASSERT(src0->type == src1->type);
+ GGML_ASSERT(src0->type == dst->type);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
@@ -4082,7 +4139,12 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const

bcast_row = true;
int ne = ne00 / 4;
- kernel = backend_ctx->kernel_sub_row;
+
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_sub_row;
+ } else {
+     kernel = backend_ctx->kernel_sub_row_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -4092,7 +4154,11 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
- kernel = backend_ctx->kernel_sub;
+ if (src0->type == GGML_TYPE_F32) {
+     kernel = backend_ctx->kernel_sub;
+ } else {
+     kernel = backend_ctx->kernel_sub_f16;
+ }

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));