@@ -352,7 +352,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         assert(tensor->view_src->buffer->buft == buffer->buft);
         return GGML_STATUS_SUCCESS;
     }
-    if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
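+    // Allocate reorder bookkeeping for every type with an optimized layout
+    // (now Q4_0 and Q4_K), unless the optimization is globally disabled.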
+    if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
         ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
         tensor->extra = extra;
         ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -2900,6 +2900,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
             return true;
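+        // The reordered Q4_K layout is only consumed by the MMVQ path, so
+        // report no support when the DMMV kernels are prioritized.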
+        case GGML_TYPE_Q4_K:
+            return !g_ggml_sycl_prioritize_dmmv;
         default:
             return false;
     }
@@ -2917,6 +2919,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
 inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_K:
             return true;
         default:
             return false;
@@ -2942,16 +2945,16 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                       size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
+                            dpct::queue_ptr stream) {
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
     SYCL_CHECK(
         CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
             .wait()));
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
 
     stream->parallel_for(
@@ -2965,18 +2968,59 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
                 *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
             }
             *(d_ptr + ib) = x[ib].d;
-        });
+        }).wait_and_throw();
+
+    sycl::free(tmp_buf, *stream);
+}
+
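+// Reorder a Q4_K tensor in place into a struct-of-arrays layout: the qs
+// nibbles of all blocks first, then all packed scales, then the per-block
+// (d, dmin) half2 values. A temporary copy of the source buffer is taken
+// so the rewrite can happen in place.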
+static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
+    GGML_ASSERT(size % sizeof(block_q4_K) == 0);
+    GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
+
+    const int nblocks = size / sizeof(block_q4_K);
+
+    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+
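+    // Destination regions inside the original buffer: QK_K / 2 qs bytes per
+    // block, then K_SCALE_SIZE scale bytes per block, then one half2 per block.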
+    auto * qs_ptr = data_device;
+    auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
+    auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
+
+    stream->parallel_for(nblocks, [=](auto i) {
+        const block_q4_K * x = (const block_q4_K *) tmp_buf;
+        const int ib = i;
+
+        for (int j = 0; j < QK_K / 2; ++j) {
+            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
+        }
+
+        for (int j = 0; j < K_SCALE_SIZE; ++j) {
+            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
+        }
+
+        dm_ptr[ib] = x[ib].dm;
+    }).wait_and_throw();
 
     sycl::free(tmp_buf, *stream);
 }
 
 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
+    uint8_t * data_device = (uint8_t *) src0->data;
     size_t ncols = src0->ne[0];
     size_t nrows = src0->ne[1];
     size_t size = ggml_nbytes(src0);
 
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
+            break;
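+        // reorder_qw_q4_k() derives its block count from the byte size, so
+        // ncols/nrows are not needed here.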
+        case GGML_TYPE_Q4_K:
+            reorder_qw_q4_k(data_device, size, 0, stream);
+            break;
+        default:
+            GGML_ABORT("reorder_qw() called with unsupported type");
+            break;
+    }
 }
 
 static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
@@ -3019,8 +3063,18 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
         extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
     }
 }
 
-static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
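+// DMMV needs F32 activations and output, a row length that is a multiple of
+// GGML_SYCL_DMMV_X, and a batch of exactly one vector.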
+static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+}
+
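+// MMVQ accepts any quantized src0 with F32 activations and output, for up to
+// MMVQ_MAX_BATCH_SIZE vectors per call.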
+static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+           src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+}
+
+static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     int64_t min_compute_capability = INT_MAX;
 
@@ -3043,13 +3097,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);
 
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);
 
     bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;