|
@@ -201,8 +201,7 @@ static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
{
|
|
{
|
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
|
|
|
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream,
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
@@ -220,8 +219,7 @@ static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
{
|
|
{
|
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
|
|
|
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream,
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
@@ -239,8 +237,7 @@ static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
{
|
|
{
|
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
|
|
|
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream,
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
@@ -256,11 +253,11 @@ static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
GGML_ASSERT(ne % QK8_0 == 0);
|
|
GGML_ASSERT(ne % QK8_0 == 0);
|
|
|
const int num_blocks = ne / QK8_0;
|
|
const int num_blocks = ne / QK8_0;
|
|
|
- sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
+ [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
@@ -268,11 +265,11 @@ static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ne;
|
|
const int num_blocks = ne;
|
|
|
- sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
+ [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
@@ -281,11 +278,11 @@ static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
GGML_ASSERT(ne % QK4_0 == 0);
|
|
GGML_ASSERT(ne % QK4_0 == 0);
|
|
|
const int num_blocks = ne / QK4_0;
|
|
const int num_blocks = ne / QK4_0;
|
|
|
- sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
+ [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
@@ -293,9 +290,8 @@ static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ne;
|
|
const int num_blocks = ne;
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
|
item_ct1);
|
|
item_ct1);
|
|
@@ -308,11 +304,11 @@ static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
GGML_ASSERT(ne % QK4_1 == 0);
|
|
GGML_ASSERT(ne % QK4_1 == 0);
|
|
|
const int num_blocks = ne / QK4_1;
|
|
const int num_blocks = ne / QK4_1;
|
|
|
- sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
+ [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
@@ -320,9 +316,8 @@ static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ne;
|
|
const int num_blocks = ne;
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
|
item_ct1);
|
|
item_ct1);
|
|
@@ -335,11 +330,11 @@ static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
GGML_ASSERT(ne % QK5_0 == 0);
|
|
GGML_ASSERT(ne % QK5_0 == 0);
|
|
|
const int num_blocks = ne / QK5_0;
|
|
const int num_blocks = ne / QK5_0;
|
|
|
- sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
+ [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
@@ -347,9 +342,8 @@ static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ne;
|
|
const int num_blocks = ne;
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
|
item_ct1);
|
|
item_ct1);
|
|
@@ -362,11 +356,11 @@ static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
GGML_ASSERT(ne % QK5_1 == 0);
|
|
GGML_ASSERT(ne % QK5_1 == 0);
|
|
|
const int num_blocks = ne / QK5_1;
|
|
const int num_blocks = ne / QK5_1;
|
|
|
- sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
+ [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
+ ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
@@ -374,9 +368,8 @@ static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, c
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ne;
|
|
const int num_blocks = ne;
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
|
|
item_ct1);
|
|
item_ct1);
|
|
@@ -389,11 +382,11 @@ static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
GGML_ASSERT(ne % QK4_NL == 0);
|
|
GGML_ASSERT(ne % QK4_NL == 0);
|
|
|
const int num_blocks = ne / QK4_NL;
|
|
const int num_blocks = ne / QK4_NL;
|
|
|
- sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
|
|
|
|
|
- ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
|
|
|
|
+ ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
@@ -404,8 +397,7 @@ static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
{
|
|
{
|
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
|
|
|
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream,
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
@@ -424,8 +416,7 @@ static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
// dpct::has_capability_or_fail(stream->get_device(),
|
|
// dpct::has_capability_or_fail(stream->get_device(),
|
|
|
// {sycl::aspect::fp16});
|
|
// {sycl::aspect::fp16});
|
|
|
|
|
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream,
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
@@ -444,8 +435,7 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
// dpct::has_capability_or_fail(stream->get_device(),
|
|
// dpct::has_capability_or_fail(stream->get_device(),
|
|
|
// {sycl::aspect::fp16});
|
|
// {sycl::aspect::fp16});
|
|
|
|
|
|
|
|
- sycl_parallel_for(
|
|
|
|
|
- stream,
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
[=](sycl::nd_item<3> item_ct1) {
|
|
@@ -460,13 +450,11 @@ static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
|
- sycl_parallel_for(stream,
|
|
|
|
|
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
|
|
|
|
- ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@@ -475,13 +463,11 @@ static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
|
- sycl_parallel_for(stream,
|
|
|
|
|
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
|
|
|
|
- ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@@ -491,13 +477,11 @@ static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
|
|
|
|
|
|
- sycl_parallel_for(stream,
|
|
|
|
|
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
|
|
|
|
- ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
+ sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@@ -506,13 +490,10 @@ static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const
|
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
|
- sycl_parallel_for(stream,
|
|
|
|
|
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
|
|
|
|
- ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@@ -522,13 +503,10 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
|
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
const int nb12, const int nb13, queue_ptr stream) {
|
|
|
|
|
|
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
|
- sycl_parallel_for(stream,
|
|
|
|
|
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
|
|
|
- sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
|
|
|
|
- [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
- cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
|
|
|
|
|
- ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ stream->parallel_for(
|
|
|
|
|
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
|
|
|
+ cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
|
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|