| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547 |
- #include "convert.hpp"
- #include "dequantize.hpp"
- #include "presets.hpp"
- template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
- static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
- const sycl::nd_item<3> &item_ct1) {
- const int64_t i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2));
- if (i >= k) {
- return;
- }
- const int64_t ib = i/qk; // block index
- const int64_t iqs = (i%qk)/qr; // quant index
- const int64_t iybs = i - i%qk; // y block start index
- const int64_t y_offset = qr == 1 ? 1 : qk/2;
- // dequantize
- dfloat2 v;
- dequantize_kernel(vx, ib, iqs, v);
- y[iybs + iqs + 0] = v.x();
- y[iybs + iqs + y_offset] = v.y();
- }
- template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
- static void dequantize_block_sycl(const void *__restrict__ vx,
- dst_t *__restrict__ y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(
- sycl::nd_range<3>(
- sycl::range<3>(1, 1, num_blocks) *
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- #if QK_K == 256
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 64),
- sycl::range<3>(1, 1, 64)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q2_K(vx, y, item_ct1);
- });
- }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q2_K(vx, y, item_ct1);
- });
- }
- #endif
- }
- template <typename dst_t>
- static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- #if QK_K == 256
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 64),
- sycl::range<3>(1, 1, 64)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q3_K(vx, y, item_ct1);
- });
- }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q3_K(vx, y, item_ct1);
- });
- }
- #endif
- }
- template <typename dst_t>
- static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb32 = k / 32;
- const int64_t nb = (k + 255) / 256;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q4_0(vx, y, nb32, item_ct1);
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb32 = k / 32;
- const int64_t nb = (k + 255) / 256;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q4_1(vx, y, nb32, item_ct1);
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- #if QK_K == 256
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 64),
- sycl::range<3>(1, 1, 64)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q5_K(vx, y, item_ct1);
- });
- }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q5_K(vx, y, item_ct1);
- });
- }
- #endif
- }
- template <typename dst_t>
- static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- #if QK_K == 256
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 64),
- sycl::range<3>(1, 1, 64)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q6_K(vx, y, item_ct1);
- });
- }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q6_K(vx, y, item_ct1);
- });
- }
- #endif
- }
- template <typename dst_t>
- static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq1_s(
- vx, y, item_ct1, iq1s_grid_gpu
- );
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq1_m(
- vx, y, item_ct1, iq1s_grid_gpu
- );
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq2_xxs(
- vx, y, item_ct1, iq2xxs_grid,
- ksigns_iq2xs, kmask_iq2xs);
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq2_xs(
- vx, y, item_ct1, iq2xs_grid,
- ksigns_iq2xs, kmask_iq2xs);
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq2_s(vx, y, item_ct1);
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq3_xxs(
- vx, y, item_ct1, iq3xxs_grid,
- ksigns_iq2xs, kmask_iq2xs);
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = k / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq3_s(
- vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
- });
- });
- }
- }
- template <typename dst_t>
- static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = (k + QK_K - 1) / QK_K;
- #if QK_K == 64
- dequantize_row_iq4_nl_sycl(vx, y, k, stream);
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq4_xs(vx, y, item_ct1);
- });
- });
- }
- #endif
- }
- template <typename dst_t>
- static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t nb = (k + QK_K - 1) / QK_K;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->submit([&](sycl::handler &cgh) {
- cgh.parallel_for(
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq4_nl(vx, y, item_ct1);
- });
- });
- }
- }
- template <typename src_t, typename dst_t>
- static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
- const sycl::nd_item<3> &item_ct1) {
- const int64_t work_group_size = item_ct1.get_local_range(2);
- const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
- // make each work-item deal with more elements since sycl global range can not exceed max int
- const src_t * x = (src_t *) vx;
- for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
- y[i] = x[i];
- }
- }
- template <typename src_t, typename dst_t>
- static void convert_unary_sycl(const void *__restrict__ vx,
- dst_t *__restrict__ y, const int64_t k,
- dpct::queue_ptr stream) {
- const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
- // decrease global range when it exceeds the max int
- int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
- sycl::range<3> block_nums(1, 1, num_blocks);
- sycl::range<3> local_range(1, 1, local_size);
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * local_range, local_range),
- [=](sycl::nd_item<3> item_ct1) {
- convert_unary<src_t>(vx, y, k, item_ct1);
- });
- }
- }
- to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
- case GGML_TYPE_Q4_1:
- return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
- case GGML_TYPE_Q5_0:
- return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
- case GGML_TYPE_Q5_1:
- return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
- case GGML_TYPE_Q8_0:
- return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_sycl;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_sycl;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_sycl;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_sycl;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_sycl;
- case GGML_TYPE_IQ1_S:
- return dequantize_row_iq1_s_sycl;
- case GGML_TYPE_IQ1_M:
- return dequantize_row_iq1_m_sycl;
- case GGML_TYPE_IQ2_XXS:
- return dequantize_row_iq2_xxs_sycl;
- case GGML_TYPE_IQ2_XS:
- return dequantize_row_iq2_xs_sycl;
- case GGML_TYPE_IQ2_S:
- return dequantize_row_iq2_s_sycl;
- case GGML_TYPE_IQ3_XXS:
- return dequantize_row_iq3_xxs_sycl;
- case GGML_TYPE_IQ3_S:
- return dequantize_row_iq3_s_sycl;
- case GGML_TYPE_IQ4_XS:
- return dequantize_row_iq4_xs_sycl;
- case GGML_TYPE_IQ4_NL:
- return dequantize_row_iq4_nl_sycl;
- case GGML_TYPE_F32:
- return convert_unary_sycl<float>;
- default:
- return nullptr;
- }
- }
- to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_row_q4_0_sycl;
- case GGML_TYPE_Q4_1:
- return dequantize_row_q4_1_sycl;
- case GGML_TYPE_Q5_0:
- return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
- case GGML_TYPE_Q5_1:
- return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
- case GGML_TYPE_Q8_0:
- return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
- case GGML_TYPE_Q2_K:
- return dequantize_row_q2_K_sycl;
- case GGML_TYPE_Q3_K:
- return dequantize_row_q3_K_sycl;
- case GGML_TYPE_Q4_K:
- return dequantize_row_q4_K_sycl;
- case GGML_TYPE_Q5_K:
- return dequantize_row_q5_K_sycl;
- case GGML_TYPE_Q6_K:
- return dequantize_row_q6_K_sycl;
- case GGML_TYPE_IQ1_S:
- return dequantize_row_iq1_s_sycl;
- case GGML_TYPE_IQ1_M:
- return dequantize_row_iq1_m_sycl;
- case GGML_TYPE_IQ2_XXS:
- return dequantize_row_iq2_xxs_sycl;
- case GGML_TYPE_IQ2_XS:
- return dequantize_row_iq2_xs_sycl;
- case GGML_TYPE_IQ2_S:
- return dequantize_row_iq2_s_sycl;
- case GGML_TYPE_IQ3_XXS:
- return dequantize_row_iq3_xxs_sycl;
- case GGML_TYPE_IQ3_S:
- return dequantize_row_iq3_s_sycl;
- case GGML_TYPE_IQ4_XS:
- return dequantize_row_iq4_xs_sycl;
- case GGML_TYPE_IQ4_NL:
- return dequantize_row_iq4_nl_sycl;
- case GGML_TYPE_F16:
- return convert_unary_sycl<sycl::half>;
- default:
- return nullptr;
- }
- }
|